In [None]:
# Core libraries
!pip install torch torchvision transformers sentence-transformers

# Vector DBs
!pip install faiss-gpu
!pip install chromadb
!pip install weaviate-client

In [None]:
!pip install faiss-cpu

In [None]:
from datasets import load_dataset

dataset = load_dataset("jxie/flickr8k", split="train")
print(dataset[0])

In [6]:
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel

# Load CLIP model + processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Prepare captions (flatten all 5 per image into a list)
all_captions = []
for row in dataset:
    all_captions.extend([
        row["caption_0"],
        row["caption_1"],
        row["caption_2"],
        row["caption_3"],
        row["caption_4"],
    ])

# Batch function for images
def embed_images(images, batch_size=32):
    all_embeds = []
    for i in range(0, len(images), batch_size):
        batch = images[i:i+batch_size]
        inputs = processor(images=batch, return_tensors="pt").to(device)
        with torch.no_grad():
            embeds = model.get_image_features(inputs["pixel_values"])
        all_embeds.append(embeds.cpu())
    return torch.cat(all_embeds, dim=0).numpy()

# Batch function for texts
def embed_texts(texts, batch_size=64):
    all_embeds = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            embeds = model.get_text_features(**inputs)
        all_embeds.append(embeds.cpu())
    return torch.cat(all_embeds, dim=0).numpy()

# Run embeddings
print("Encoding images...")
image_embeddings = embed_images(dataset["image"], batch_size=32)   # Shape: (8000, 512)

print("Encoding captions...")
text_embeddings = embed_texts(all_captions, batch_size=64)        # Shape: (40000, 512)

print("Image embeddings shape:", image_embeddings.shape)
print("Text embeddings shape:", text_embeddings.shape)



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Encoding images...
Encoding captions...
Image embeddings shape: (6000, 512)
Text embeddings shape: (30000, 512)


In [7]:
print("Images:", len(dataset["image"]))
print("Captions:", len(all_captions))


Images: 6000
Captions: 30000


In [8]:
import faiss
import numpy as np
import time

# Load saved embeddings (if needed)
# image_embeddings = np.load("image_embeddings.npy")
# text_embeddings = np.load("text_embeddings.npy")

# Normalize vectors (important for cosine similarity)
def normalize(vectors):
    faiss.normalize_L2(vectors)
    return vectors

image_embeddings = normalize(image_embeddings.astype("float32"))
text_embeddings = normalize(text_embeddings.astype("float32"))

# Build FAISS index
d = image_embeddings.shape[1]   # embedding dimension (512)
index = faiss.IndexFlatIP(d)    # Inner Product (cosine similarity after normalization)

# Add embeddings
start = time.time()
index.add(image_embeddings)     # add 6000 image embeddings
end = time.time()
print(f"FAISS Ingestion Time: {end - start:.3f} seconds")
print("Total vectors in index:", index.ntotal)

# Run queries
k = 5
queries = text_embeddings[:100]

start = time.time()
D, I = index.search(queries, k)   # distances (D), indices (I)
end = time.time()

print(f"Queried {len(queries)} embeddings in {end - start:.3f} seconds")
print("Avg latency per query:", (end - start)/len(queries) * 1000, "ms")
print("Shape of results:", I.shape)
print("Sample result indices for query[0]:", I[0])


FAISS Ingestion Time: 0.004 seconds
Total vectors in index: 6000
Queried 100 embeddings in 0.019 seconds
Avg latency per query: 0.19229650497436523 ms
Shape of results: (100, 5)
Sample result indices for query[0]: [   0 3473 1014 3678  112]


In [9]:
import faiss
import numpy as np
import time

# Load saved embeddings (if needed)
# image_embeddings = np.load("image_embeddings.npy")
# text_embeddings = np.load("text_embeddings.npy")

# Normalize vectors (important for cosine similarity)
def normalize(vectors):
    faiss.normalize_L2(vectors)
    return vectors

image_embeddings = normalize(image_embeddings.astype("float32"))
text_embeddings = normalize(text_embeddings.astype("float32"))

# Build FAISS IVF index
d = image_embeddings.shape[1]       # embedding dimension (512)
nlist = 100                         # number of clusters (tuneable hyperparameter)

# Quantizer is a Flat index used inside IVF
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

# IMPORTANT: train the IVF index before adding vectors
index.train(image_embeddings)
index.add(image_embeddings)

print("Total vectors in index:", index.ntotal)


# --- 2. Run queries ---
k = 5   # top-k neighbors to retrieve
queries = text_embeddings[:100]   # take 100 caption embeddings as queries for benchmarking

start = time.time()
D, I = index.search(queries, k)   # distances (D), indices (I)
end = time.time()

print(f"Queried {len(queries)} embeddings in {end - start:.3f} seconds")
print("Avg latency per query:", (end - start)/len(queries) * 1000, "ms")
print("Shape of results:", I.shape)
print("Sample result indices for query[0]:", I[0])

Total vectors in index: 6000
Queried 100 embeddings in 0.001 seconds
Avg latency per query: 0.013058185577392578 ms
Shape of results: (100, 5)
Sample result indices for query[0]: [   0 3473 1014 3678  112]


In [12]:
def recall_at_k(index_true, index_test, k=5):
    """
    index_true: ground-truth neighbors (from Flat)
    index_test: neighbors from IVF or other index
    """
    hits = 0
    for i in range(index_true.shape[0]):
        hits += len(set(index_true[i, :k]) & set(index_test[i, :k]))
    return hits / (index_true.shape[0] * k)

# Run Flat (ground truth)
flat_index = faiss.IndexFlatIP(d)
flat_index.add(image_embeddings)
_, I_flat = flat_index.search(text_embeddings[:100], 5)

# Run IVF
index.nprobe = 15
_, I_ivf = index.search(text_embeddings[:100], 5)

# Compare recall
recall = recall_at_k(I_flat, I_ivf, k=5)
print("Recall@5 (IVF vs Flat), nprobe=15:", recall)

Recall@5 (IVF vs Flat), nprobe=15: 0.94


In [13]:
import faiss
import time

# Build HNSW index
d = image_embeddings.shape[1]   # embedding dimension (512)
m = 32                          # number of neighbors per node in graph (tunable)
index_hnsw = faiss.IndexHNSWFlat(d, m, faiss.METRIC_INNER_PRODUCT)

# Add vectors
start = time.time()
index_hnsw.add(image_embeddings)
end = time.time()
print(f"HNSW Ingestion Time: {end - start:.3f} seconds")
print("Total vectors in HNSW index:", index_hnsw.ntotal)

# Query
k = 5
queries = text_embeddings[:100]

start = time.time()
D_hnsw, I_hnsw = index_hnsw.search(queries, k)
end = time.time()

latency = (end - start) / len(queries) * 1000  # ms/query
print(f"HNSW queried {len(queries)} embeddings")
print("Avg latency per query:", latency, "ms")

# Recall vs Flat baseline
recall_hnsw = recall_at_k(I_flat, I_hnsw, k=5)
print("Recall@5 (HNSW vs Flat):", recall_hnsw)

HNSW Ingestion Time: 0.618 seconds
Total vectors in HNSW index: 6000
HNSW queried 100 embeddings
Avg latency per query: 0.05462646484375 ms
Recall@5 (HNSW vs Flat): 0.918


In [16]:
import time
import numpy as np

# Create new collection for captions
collection = client.create_collection(name="cifar_embeddings_chroma")

#  Helper: chunked ingestion
def batch_add(collection, embeddings, ids, metadatas=None, batch_size=5000):
    for i in range(0, len(embeddings), batch_size):
        batch_embeds = embeddings[i:i+batch_size]
        batch_ids = ids[i:i+batch_size]
        batch_metas = None
        if metadatas is not None:
            batch_metas = metadatas[i:i+batch_size]
        collection.add(
            embeddings=batch_embeds.tolist(),
            ids=batch_ids,
            metadatas=batch_metas
        )

# Prepare caption embeddings
ids = [f"cap_{i}" for i in range(len(text_embeddings))]
metadatas = [{"type": "caption"} for _ in range(len(text_embeddings))]

# Ingest captions (30k vectors)
start = time.time()
batch_add(collection, text_embeddings, ids, metadatas, batch_size=5000)
end = time.time()

print(f"Chroma Ingestion Time (captions): {end - start:.3f} seconds")
print("Total vectors in cifar_embeddings_chroma:", collection.count())

# Query captions (self-search for demo)
queries = text_embeddings[:100]   # take first 100 captions
start = time.time()
results = collection.query(
    query_embeddings=queries.tolist(),
    n_results=5
)
end = time.time()

latency = (end - start) / len(queries) * 1000  # ms/query
print(f"Chroma queried {len(queries)} captions")
print("Avg latency per query:", latency, "ms")

# Recall@5 vs Flat baseline
I_chroma = np.array([[int(x.split("_")[1]) for x in row] for row in results["ids"]])
recall_chroma = recall_at_k(I_flat, I_chroma, k=5)
print("Recall@5 (Chroma vs Flat):", recall_chroma)


Chroma Ingestion Time (captions): 14.386 seconds
Total vectors in cifar_embeddings_chroma: 30000
Chroma queried 100 captions
Avg latency per query: 0.7642173767089844 ms
Recall@5 (Chroma vs Flat): 0.002


In [26]:
import weaviate
from weaviate.classes.config import DataType

# Connect to existing local instance
client = weaviate.connect_to_local(port=8079, grpc_port=50060)

# Drop old collection if exists
try:
    client.collections.delete("CifarEmbedding")
except:
    pass

# Create new collection
cifar_collection = client.collections.create(
    name="CifarEmbedding",
    vectorizer_config=None,  # we're supplying vectors
    properties=[
        {"name": "caption", "data_type": DataType.TEXT}
    ]
)

print("Collection created")

# Ingest sample embeddings
import time
start = time.time()
with client.batch.dynamic() as batch:
    for i, emb in enumerate(text_embeddings[:5000]):  # test with 5k first
        batch.add_object(
            properties={"caption": f"caption {i}"},
            collection="CifarEmbedding",
            vector=emb.tolist()
        )
end = time.time()
print(f"Weaviate Ingestion Time (5k captions): {end - start:.3f} seconds")

# Query
queries = text_embeddings[:10]
start = time.time()
results = []
for q in queries:
    res = cifar_collection.query.near_vector(
        near_vector=q.tolist(),
        return_properties=["caption"],
        limit=5
    )
    ids = [obj.uuid for obj in res.objects]
    results.append(ids)
end = time.time()

latency = (end - start) / len(queries) * 1000
print(f"Weaviate queried {len(queries)} captions")
print("Avg latency per query:", latency, "ms")

Collection created
Weaviate Ingestion Time (5k captions): 3.566 seconds
Weaviate queried 10 captions
Avg latency per query: 1.819157600402832 ms


In [27]:
import time
import numpy as np

# Example: use your flat_index or hnsw_index here
faiss_index = flat_index   # or index_ivf / index_hnsw

queries = text_embeddings[:1000]  # 1000 queries

start = time.time()
for q in queries:
    _ = faiss_index.search(q.reshape(1,-1).astype("float32"), 5)
end = time.time()

qps_faiss = len(queries) / (end - start)
print(f"FAISS QPS: {qps_faiss:.2f}")

FAISS QPS: 1929.94


In [28]:
queries = text_embeddings[:1000]

start = time.time()
for q in queries:
    _ = collection.query(
        query_embeddings=[q.tolist()],
        n_results=5
    )
end = time.time()

qps_chroma = len(queries) / (end - start)
print(f"Chroma QPS: {qps_chroma:.2f}")

Chroma QPS: 719.01


In [29]:
queries = text_embeddings[:1000]

start = time.time()
for q in queries:
    _ = cifar_collection.query.near_vector(
        near_vector=q.tolist(),
        return_properties=["caption"],
        limit=5
    )
end = time.time()

qps_weaviate = len(queries) / (end - start)
print(f"Weaviate QPS: {qps_weaviate:.2f}")

Weaviate QPS: 598.40


In [30]:
import time
import numpy as np
import faiss

d = image_embeddings.shape[1]  # 512
faiss_index = faiss.IndexFlatIP(d)

for size in [5000, 10000, 20000]:
    subset = image_embeddings[:size].astype("float32")
    start = time.time()
    faiss_index.add(subset)
    end = time.time()
    print(f"FAISS ingestion {size} vectors: {end-start:.3f}s")

FAISS ingestion 5000 vectors: 0.007s
FAISS ingestion 10000 vectors: 0.015s
FAISS ingestion 20000 vectors: 0.024s


In [32]:
import chromadb
import time

chroma_client = chromadb.Client()

# Create or reset collection
try:
    chroma_client.delete_collection("chroma_ingest_test")
except:
    pass

chroma_coll = chroma_client.create_collection(name="chroma_ingest_test")

# Helper for batch add
def chroma_batch_add(coll, embeddings, batch_size=5000):
    ids = [str(i) for i in range(len(embeddings))]
    for i in range(0, len(embeddings), batch_size):
        coll.add(
            embeddings=embeddings[i:i+batch_size].tolist(),
            ids=ids[i:i+batch_size],
            metadatas=[{"source": "bench"}] * len(embeddings[i:i+batch_size])
        )

# Benchmark different sizes
for size in [5000, 10000, 20000]:
    try:
        chroma_client.delete_collection("chroma_ingest_test")
    except:
        pass
    chroma_coll = chroma_client.create_collection(name="chroma_ingest_test")

    start = time.time()
    chroma_batch_add(chroma_coll, image_embeddings[:size])
    end = time.time()
    print(f"Chroma ingestion {size} vectors: {end-start:.3f}s")

Chroma ingestion 5000 vectors: 1.636s
Chroma ingestion 10000 vectors: 2.364s
Chroma ingestion 20000 vectors: 2.806s


In [33]:
from weaviate.classes.config import DataType

# Reset collection for benchmarking
try:
    client.collections.delete("IngestBench")
except:
    pass

ingest_coll = client.collections.create(
    name="IngestBench",
    vectorizer_config=None,
    properties=[{"name": "caption", "data_type": DataType.TEXT}]
)

for size in [5000, 10000, 20000]:
    # delete + recreate for fresh timing
    try:
        client.collections.delete("IngestBench")
    except:
        pass
    ingest_coll = client.collections.create(
        name="IngestBench",
        vectorizer_config=None,
        properties=[{"name": "caption", "data_type": DataType.TEXT}]
    )

    start = time.time()
    with client.batch.dynamic() as batch:
        for i, emb in enumerate(image_embeddings[:size]):
            batch.add_object(
                properties={"caption": f"img {i}"},
                collection="IngestBench",
                vector=emb.tolist()
            )
    end = time.time()
    print(f"Weaviate ingestion {size} vectors: {end-start:.3f}s")


Weaviate ingestion 5000 vectors: 4.059s
Weaviate ingestion 10000 vectors: 3.045s
Weaviate ingestion 20000 vectors: 4.000s
