Excerpt from release date:
The dot products you’ll receive are a bit more centered around 0.5. This may have been an engineering decision, but it is similar to what other providers’ models have done in the 26 months since the embedding-3 models were released.
You will find that 0.4 is a pretty good threshold, depending on how specific the corpus and query are. As with any switch to another embeddings provider, or even between the “large” and “small” variants, or when reducing their dimensions, you’ll have to do your own tweaking per model and even per application if you want non-correlation rejection in addition to a top-k cap on results.
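A quick way to pick a cutoff for your own data: score a couple of query/passage pairs you know are related and a couple you know are not, and place the threshold between the two clusters of scores. A minimal sketch, assuming `model` is the SentenceTransformer loaded as in the script below (dot-product similarity over normalized embeddings), with placeholder strings:

q = model.encode(["example query"], convert_to_tensor=True)
good = model.encode(["a passage that actually answers it"], convert_to_tensor=True)
bad = model.encode(["an unrelated passage"], convert_to_tensor=True)
print(float(model.similarity(q, good)[0][0]), float(model.similarity(q, bad)[0][0]))
# put your threshold somewhere between the two numbers you see for your corpus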
Oops, did I leave this here?
"""Demonstration of sentence_transformers embeddings with Jina v5 models (1GB/2GB VRAM),
structured as a basic vector search example as documentation, with procedural scripting.
### Getting Started Locally
NVIDIA Kepler, Maxwell, or Pascal (+ Volta) GPU?
Example Pascal GPUs: GeForce GTX 1050 2GB, Quadro P2000 5GB, with video card drivers >=561.17, <580
Use CUDA 12.6 builds of Torch; torch.__version__ should then read e.g. '2.9.1+cu126' if not on Blackwell+
`pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 sentence_transformers xformers --index-url https://download.pytorch.org/whl/cu126`
(held back to torch 2.9 for compatibility with other ML projects you may encounter)
"""
from time import monotonic as now
s = now()
import os
os.environ["HF_HUB_OFFLINE"] = "1" # ensure updates happen only when you want
os.environ["TRANSFORMERS_OFFLINE"] = "1"
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, SimilarityFunction
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
print(f"[{now()-s:.4f}] imports done")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available. Not attempting CPU!")
LARGE_MODEL = "jinaai/jina-embeddings-v5-text-small" # model.max_seq_length 32768
TRANSFORMERS_MODEL = "jinaai/jina-embeddings-v5-text-nano" # model.max_seq_length 8192
model = SentenceTransformer(
TRANSFORMERS_MODEL,
device="cuda",
config_kwargs={
"normalize_embeddings": True,
},
model_kwargs={
"dtype": torch.float16, # use bfloat16 and flash attn2 on Blackwell+, try float32 performance also
"default_task": "retrieval"
},
trust_remote_code=True,
local_files_only=True, # comment out when first run to download from HF
similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
)
print(f"[{now()-s:.4f}] Model loaded")
def clean_model(model_obj):
    """Unload the model and free memory."""
    del model_obj
    import gc
    gc.collect()  # Explicitly run Python garbage collector
    torch.cuda.empty_cache()  # Clear cached GPU memory
### --- DEMO DATA ----
queries = [
"Best local embeddings models to compete with OpenAI?",
]
structured_docs = [
{
"metadata": {"filename": "doc1.txt", "chunk_number": 1},
"text": "Considered a top-tier open-source model, BGE-M3 and its versions often rank highly on benchmarks and offer a cost-effective alternative to OpenAI's models for local execution.",
},
{
"metadata": {"filename": "doc1.txt", "chunk_number": 2},
"text": "Jina's models offer innovation in open-source text embeddings and can compete with proprietary models on various tasks, including multilingual ones",
},
{
"metadata": {"filename": "doc2.txt", "chunk_number": 1},
"text": "Check leaderboards like the Massive Text Embedding Benchmark (MTEB) to see how models perform on different tasks, such as general text or specific domains",
},
{
"metadata": {"filename": "doc2.txt", "chunk_number": 2},
"text": """Jasper and Stella: distillation of SOTA embedding models
We propose a novel multi-stage distillation framework that enables a smaller student embedding model to distill multiple larger teacher embedding models.""",
},
{
"metadata": {"filename": "distracton.txt", "chunk_number": 1},
"text": """Stella Artois to branch and embed deeply in the distilled spirits vertical.""",
},
]
## -- GPU-powered search example -- ##
CUTOFF_THRESHOLD = 0.2
top_k = min(5, len(structured_docs))
print(f"[{(s:=now())-s:.4f}] Encoding documents")
doc_texts = [doc["text"] for doc in structured_docs]
doc_embeddings = model.encode(
sentences=doc_texts,
convert_to_tensor=True,
).to("cuda")
print(f"[{now()-s:.4f}] Encoded {len(doc_texts)} docs, shape: {doc_embeddings.shape}")
vector_database = [
{"text": doc["text"], "metadata": doc["metadata"], "embedding": doc_embeddings[i]}
for i, doc in enumerate(structured_docs)
]
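# Optional (an addition, not in the original flow): cache the document embeddings so repeat
# runs can skip re-encoding. Uncomment to use.
# torch.save(doc_embeddings.cpu(), "doc_embeddings.pt")
# doc_embeddings = torch.load("doc_embeddings.pt").to("cuda")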
query_text_for_search = queries[0]
print(f"[{now()-s:.4f}] Encoding query: '{query_text_for_search}'")
query_embedding = model.encode(
sentences=[query_text_for_search],
convert_to_tensor=True,
task="retrieval",
prompt_name="query",
).to("cuda")
print(f"[{now()-s:.4f}] Query encoded")
# ✅ Stack individual doc tensors → (N, dim) matrix, then compare against query
db_embeddings_tensor = torch.stack([doc["embedding"] for doc in vector_database]) # (N, dim)
similarities_tensor = model.similarity(query_embedding, db_embeddings_tensor) # (1, N)
similarities_list = similarities_tensor[0].tolist() # back to plain list
search_results = sorted(
[
{"score": similarities_list[i], "text": entry["text"], "metadata": entry["metadata"]}
for i, entry in enumerate(vector_database)
],
key=lambda x: x["score"],
reverse=True,
)
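# Equivalent GPU-side ranking (optional alternative): torch.topk keeps the selection on the
# device and skips the Python-level sort above.
# top_scores, top_idx = torch.topk(similarities_tensor[0], k=top_k)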
print(f"[{now()-s:.4f}] Search completed")
print(f"\n--- Search Results (Threshold: {CUTOFF_THRESHOLD:.2f}) ---")
rank, found = 1, False
for result in search_results[:top_k]:
    if result["score"] >= CUTOFF_THRESHOLD:
        found = True
        truncated = (result["text"][:80] + "...") if len(result["text"]) > 80 else result["text"]
        print(f"Rank {rank}: Score {result['score']:.4f}")
        print(f" Metadata: {result['metadata']}")
        print(f" Text: {truncated}")
        print("-" * 20)
        rank += 1
if not found:
    print("No results found above the specified threshold.")
This could also be useful:
def count_tokens(model, text):
    """Tokenize `text` and report its length against the model's max_seq_length."""
    tokenizer = model._first_module().tokenizer
    encoded_inputs = tokenizer(
        [text],
        padding=True,
        truncation=True,
        max_length=model.max_seq_length,  # use the model's configured max length (e.g., 8192 for the nano model above)
        return_tensors='pt',  # return PyTorch tensors
    )
    token_lengths = encoded_inputs['input_ids'].shape[1]
    print(f"Max Sequence Length: {model.max_seq_length}")
    print(f"Tokenized Sequence Length (with padding): {token_lengths}")
    return token_lengths
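# Example usage (illustrative): report the token count of the first demo chunk.
# count_tokens(model, structured_docs[0]["text"])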