Performance test: reading images directly from files vs. reading them with Lance
1. Test code
import os
import argparse
import pyarrow as pa
import lance
import time
from tqdm import tqdm
from pycocotools.coco import COCO
import os.path as osp
import numpy as np
from PIL import Image


def process_images_detect(images_folder, split, schema, ann_file):
    # Yield one RecordBatch per image: raw JPEG bytes plus its COCO annotations.
    coco = COCO(ann_file)
    images = coco.loadImgs(coco.getImgIds())
    images2id = {}
    for im_ann in images:
        images2id[im_ann["file_name"]] = im_ann["id"]
    image2ann = coco.imgToAnns
    for image_file in tqdm(
        os.listdir(images_folder), total=len(os.listdir(images_folder))
    ):
        if ".jpg" not in image_file:
            continue
        im_id = images2id[image_file]
        bboxes = []
        catids = []
        for ann in image2ann[im_id]:
            bboxes.append(ann["bbox"])
            catids.append(ann["category_id"])
        with open(osp.join(images_folder, image_file), "rb") as f:
            im = f.read()
        image_array = pa.array([im], type=pa.binary())
        filename_array = pa.array([str(image_file)], type=pa.string())
        bboxes_array = pa.array([np.asarray(bboxes).tobytes()], type=pa.binary())
        catid_array = pa.array([np.asarray(catids).tobytes()], type=pa.binary())
        labels = pa.array([image_file], type=pa.string())
        # Yield a RecordBatch for each image
        yield pa.RecordBatch.from_arrays(
            [image_array, filename_array, bboxes_array, catid_array, labels],
            schema=schema,
        )


# Write the RecordBatch stream to a Lance dataset
def write_to_lance(data_folder, dataset_name, schema):
    for split in ["train2017"]:
        lance_file_path = os.path.join(data_folder, f"{dataset_name}_{split}.lance")
        reader = pa.RecordBatchReader.from_batches(
            schema,
            process_images_detect(
                osp.join(data_folder, "train2017"),
                split,
                schema,
                osp.join(data_folder, "annotations/instances_train2017.json"),
            ),
        )
        lance.write_dataset(
            reader,
            lance_file_path,
            schema,
        )


def loading_into_pandas(images_folder, dataset_name):
    # Scan the Lance dataset batch by batch and convert each batch to pandas.
    data_frames = {}  # Dictionary to store DataFrames for each data type
    batch_size = args.batch_size
    for split in ["train2017"]:
        uri = os.path.join(images_folder, f"{dataset_name}_{split}.lance")
        ds = lance.dataset(uri)
        for batch in tqdm(
            ds.to_batches(columns=["image", "filename"], batch_size=batch_size),
            desc=f"Loading {split} batches",
        ):
            batch.to_pandas()
    return data_frames


def load_file(data_dir):
    # Baseline: open every .jpg in the folder directly with PIL.
    image_dir = osp.join(data_dir, "train2017")
    files = [osp.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]
    for f in tqdm(files):
        im = Image.open(f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process image dataset.")
    parser.add_argument(
        "--batch_size", type=int, default=10, help="Batch size for processing images"
    )
    parser.add_argument("--dataset", type=str, help="Path to the image dataset folder")
    parser.add_argument("--ann_file")
    args = parser.parse_args()

    dataset_path = args.dataset
    if dataset_path is None:
        raise ValueError(
            "Please provide the path to the image dataset folder using the --dataset argument."
        )

    # Extract dataset name
    dataset_name = os.path.basename(dataset_path)

    schema = pa.schema(
        [
            pa.field("image", pa.binary()),
            pa.field("filename", pa.string()),
            pa.field("bbox", pa.binary()),
            pa.field("catid", pa.binary()),
            pa.field("label", pa.string()),
        ]
    )

    # write_to_lance(dataset_path, dataset_name, schema)

    start = time.time()
    data_frames = loading_into_pandas(dataset_path, dataset_name)
    end = time.time()
    print(f"Lancedb Time(sec): {end - start:.2f}")

    start = time.time()
    load_file(dataset_path)
    end = time.time()
    print(f"File Time(sec): {end - start:.2f}")
2. Test data
COCO 2017 training-set images, 118,287 images in total.
3. Software and hardware information
pyarrow==15.0.0 pydantic==2.7.1 lancedb==0.10.1 pylance==0.14.1 numpy==1.26.3
4. Test results
Comparing Lance reads against reading the files directly by name, we found that on the SSD the Lance read is slower than direct file reading, while on the HDD the Lance read is faster, and by a larger margin. batch_size is set to 16. Below are the relevant screenshots of the test on the two devices:
On the SSD: [screenshot]
On the HDD: [screenshot]
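For reference, given the argparse options in the test code, the batch_size=16 runs above would correspond to an invocation roughly like the following (the script name and dataset path are placeholders, not taken from the post):

python lance_vs_file_read.py --dataset /data/coco --batch_size 16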
I would like to ask whether this test result is reasonable, since it is somewhat different from this test result. I would also like to know what causes it, so that we know in which cases we should use Lance in the future.