qdrant-client icon indicating copy to clipboard operation
qdrant-client copied to clipboard

Cant upload bulk data to local qdrant client.

Open RaihanulHaque opened this issue 8 months ago • 1 comments

def __init__(self, dataset_path="movies_metadata.csv", sample_size=1000):
        # Connect to Qdrant Cloud
        self.client = QdrantClient(
            url=os.getenv("QDRANT_URL_LOCAL"),
            timeout=300
        )
        self.model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')  # Adjust device if needed
        self.collection_name = "movies_large"

        # Load and preprocess movie dataset
        self._load_dataset(dataset_path, sample_size)
        self._initialize_collection()

    def _load_dataset(self, dataset_path, sample_size=1000):
        """Load and sample movie data from CSV."""
        

    def _initialize_collection(self):
        """Create or reset the collection and upload data."""
        if self.client.collection_exists(self.collection_name):
            self.client.delete_collection(self.collection_name)
            print(f"Collection '{self.collection_name}' deleted.")

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE),
            optimizers_config=models.OptimizersConfigDiff(
                indexing_threshold=0,
            ),
            shard_number=2,
        )

        points = [
            PointStruct(
                id=idx,
                vector=vector,
                payload={"title": movie["title"], "description": movie["description"]}
            )
            for idx, (vector, movie) in enumerate(zip(self.vectors, self.movies))
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )
        print(f"Collection '{self.collection_name}' initialized with {len(points)} points.")

When I uploaded 1,000 points, it worked fine. But with 10,000 points, it gets stuck and doesn’t send the data to the client. I checked the bulk upload documentation but didn’t find anything helpful. My device has plenty of free RAM, so I don’t think it’s a memory issue.

RaihanulHaque avatar Mar 18 '25 09:03 RaihanulHaque

hi @RaihanulHaque

it's better to split your data in chunks

you can do it manually and write a loop with upsert on your own or use convenience methods like upload_collection or upload_points

joein avatar Mar 18 '25 10:03 joein