Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/docs.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
"pages": [
"tables/index",
"tables/create",
"tables/multimodal",
"tables/schema",
"tables/update",
"tables/versioning",
Expand Down
18 changes: 18 additions & 0 deletions docs/snippets/multimodal.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}

export const PyBlobApiIngest = "import lancedb\nimport lance\n\ndb = lancedb.connect(db_path_factory(\"blob_db\"))\n \n# Create sample data\ndata = [\n {\"id\": 1, \"video\": b\"fake_video_bytes_1\"},\n {\"id\": 2, \"video\": b\"fake_video_bytes_2\"}\n]\n \n# Create the table\ntbl = db.create_table(\"videos\", data=data, schema=schema)\n";

export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";

export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";

export const PyDefineSchema = "# Define schema explicitly to ensure image_blob is treated as binary\nschema = pa.schema([\n    pa.field(\"id\", pa.int32()),\n    pa.field(\"filename\", pa.string()),\n    pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n    pa.field(\"image_blob\", pa.binary()),  # Important: Use pa.binary() for blobs\n    pa.field(\"label\", pa.string())\n])\n";

export const PyIngestData = "tbl = db.create_table(\"images\", data=data, schema=schema, mode=\"overwrite\")\n";

export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport pandas as pd\nimport numpy as np\nimport io\nfrom PIL import Image\n";

export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";

export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";

4 changes: 2 additions & 2 deletions docs/snippets/quickstart.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export const PyQuickstartVectorSearch1 = "# Let's search for vectors similar to

export const PyQuickstartVectorSearch2 = "# Let's search for vectors similar to \"wizard\"\nquery_vector = [0.7, 0.3, 0.5]\n\nresults = table.search(query_vector).limit(2).to_polars()\nprint(results)\n";

export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";

export const TsQuickstartAddData = "const moreData = [\n { id: \"7\", text: \"mage\", vector: [0.6, 0.3, 0.4] },\n { id: \"8\", text: \"bard\", vector: [0.3, 0.8, 0.4] },\n];\n\n// Add data to table\nawait table.add(moreData);\n";

export const TsQuickstartCreateTable = "const data = [\n { id: \"1\", text: \"knight\", vector: [0.9, 0.4, 0.8] },\n { id: \"2\", text: \"ranger\", vector: [0.8, 0.4, 0.7] },\n { id: \"9\", text: \"priest\", vector: [0.6, 0.2, 0.6] },\n { id: \"4\", text: \"rogue\", vector: [0.7, 0.4, 0.7] },\n];\nlet table = await db.createTable(\"adventurers\", data, { mode: \"overwrite\" });\n";
Expand All @@ -24,8 +26,6 @@ export const TsQuickstartOpenTable = "table = await db.openTable(\"adventurers\"

export const TsQuickstartOutputArray = "result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";

export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";

export const TsQuickstartVectorSearch1 = "// Let's search for vectors similar to \"warrior\"\nlet queryVector = [0.8, 0.3, 0.8];\n\nlet result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";

export const TsQuickstartVectorSearch2 = "// Let's search for vectors similar to \"wizard\"\nqueryVector = [0.7, 0.3, 0.5];\n\nconst results = await table.search(queryVector).limit(2).toArray();\nconsole.table(results);\n";
Expand Down
4 changes: 2 additions & 2 deletions docs/snippets/search.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ export const PyBasicHybridSearch = "data = [\n {\"text\": \"rebel spaceships

export const PyBasicHybridSearchAsync = "uri = \"data/sample-lancedb\"\nasync_db = await lancedb.connect_async(uri)\ndata = [\n {\"text\": \"rebel spaceships striking from a hidden base\"},\n {\"text\": \"have won their first victory against the evil Galactic Empire\"},\n {\"text\": \"during the battle rebel spies managed to steal secret plans\"},\n {\"text\": \"to the Empire's ultimate weapon the Death Star\"},\n]\nasync_tbl = await async_db.create_table(\"documents_async\", schema=Documents)\n# ingest docs with auto-vectorization\nawait async_tbl.add(data)\n# Create a fts index before the hybrid search\nawait async_tbl.create_index(\"text\", config=FTS())\ntext_query = \"flower moon\"\n# hybrid search with default re-ranker\nawait (await async_tbl.search(\"flower moon\", query_type=\"hybrid\")).to_pandas()\n";

export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";

export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n";

export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";

export const PyCreateTableAsyncWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\nasync_tbl = await async_db.create_table(\"documents_async\", data=data)\n";

export const PyCreateTableWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\n# Synchronous client\ntbl = db.create_table(\"documents\", data=data)\n";
Expand Down
123 changes: 123 additions & 0 deletions docs/tables/multimodal.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
---
title: Multimodal Data (Blobs)
sidebarTitle: "Working with multimodal data"
description: Learn how to store and query multimodal data (images, audio, video) directly in LanceDB using binary columns.
icon: "images"
keywords: ["blob", "large binary", "blobs", "multimodal"]
---

import {
PyMultimodalImports as MultimodalImports,
PyCreateDummyData as CreateDummyData,
PyDefineSchema as DefineSchema,
PyIngestData as IngestData,
PySearchData as SearchData,
PyProcessResults as ProcessResults,
PyBlobApiSchema as BlobApiSchema,
PyBlobApiIngest as BlobApiIngest,
} from '/snippets/multimodal.mdx';

LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.

This guide demonstrates how to ingest, store, and retrieve image data using standard binary columns, and also introduces the **Lance Blob API** for optimized handling of larger multimodal files.

## Storing binary data

To store binary data, you need to use the `pa.binary()` data type in your Arrow schema. In Python, this corresponds to `bytes` objects if you're using LanceDB's Pydantic `LanceModel` to define the schema.

### 1. Setup and imports

First, let's import the necessary libraries. We'll use `PIL` (Pillow) for image handling and `io` for byte conversion.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{MultimodalImports}
</CodeBlock>
</CodeGroup>

### 2. Preparing data

For this example, we'll create some dummy in-memory images. In a real application, you would read these from files or an API. The key is to convert your data (image, audio, etc.) into a raw `bytes` object.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{CreateDummyData}
</CodeBlock>
</CodeGroup>

### 3. Defining the schema

When creating the table, it is **highly recommended** to define the schema explicitly. This ensures that your binary data is correctly interpreted as a `binary` type by Arrow/LanceDB and not as a generic string or list.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{DefineSchema}
</CodeBlock>
</CodeGroup>

### 4. Ingesting data

Now, create the table using the data and the defined schema.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{IngestData}
</CodeBlock>
</CodeGroup>

## Retrieving and using blobs

When you search your LanceDB table, you can retrieve the binary column just like any other metadata.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{SearchData}
</CodeBlock>
</CodeGroup>

### Converting bytes back to objects

Once you have the `bytes` data back from the search result, you can decode it back into its original format (e.g., a PIL Image, an Audio buffer, etc.).

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{ProcessResults}
</CodeBlock>
</CodeGroup>

## Large Blobs (Blob API)

For larger files like high-resolution images or videos, Lance provides a specialized **Blob API**. By using `pa.large_binary()` and specific metadata, you enable **lazy loading** and optimized encoding. This allows you to work with massive datasets without loading all binary data into memory upfront.

### 1. Defining a blob schema

To use the Blob API, you must mark the column with `{"lance-encoding:blob": "true"}` metadata.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{BlobApiSchema}
</CodeBlock>
</CodeGroup>

### 2. Ingesting large blobs

You can then ingest data normally, and Lance will handle the optimized storage.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{BlobApiIngest}
</CodeBlock>
</CodeGroup>

<Card>
For more advanced usage, including random access and file-like reading of blobs, see the
Lance format's [blob API documentation](https://lance.org/guide/blob/).
</Card>

## Other modalities

The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:

- **Audio:** Read `.wav` or `.mp3` files as bytes.
- **Video:** Store short clips or full videos using the Blob API.
- **PDFs/Documents:** Store the raw file content for document search.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ dependencies = [
"pydantic>=2.12.4",
"pytest>=9.0.1",
"pytest-asyncio>=1.3.0",
"Pillow>=11.0.0",
]
123 changes: 123 additions & 0 deletions tests/py/test_multimodal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors

import pytest
# NOTE(review): this try/except is ineffective as written — the same modules
# are imported unconditionally below (inside the snippet markers), so a
# missing dependency still raises ImportError at collection time and the
# pytest.importorskip() calls inside the tests never get a chance to run.
# Consider guarding collection with importorskip at module level instead.
try:
    import lancedb
    import numpy as np
    import pyarrow as pa
    import io
    from PIL import Image
except ImportError:
    pass

# The --8<-- markers delimit regions extracted into docs/snippets/*.mdx by
# scripts/mdx_snippets_gen.py; keep the code between them tutorial-friendly.
# --8<-- [start:multimodal_imports]
import lancedb
import pyarrow as pa
import pandas as pd
import numpy as np
import io
from PIL import Image
# --8<-- [end:multimodal_imports]

def test_multimodal_ingestion(db_path_factory):
    """End-to-end docs example: ingest PNG images as ``pa.binary()`` blobs
    alongside vectors and metadata, run a vector search, and decode the
    returned bytes back into PIL Images.

    The code between the ``--8<--`` markers is extracted verbatim into the
    documentation snippets, so it is written in tutorial style — do not
    restructure it without regenerating the snippets.
    """
    # Ensure dependencies are available
    # NOTE(review): these skips only take effect if module import succeeded;
    # see the unconditional imports at the top of this file.
    pytest.importorskip("PIL")
    pytest.importorskip("lancedb")
    pytest.importorskip("numpy")

    # --8<-- [start:create_dummy_data]
    # Create some dummy images
    def create_dummy_image(color):
        # Render a 100x100 solid-color square and return its PNG bytes.
        img = Image.new('RGB', (100, 100), color=color)
        buf = io.BytesIO()
        img.save(buf, format='PNG')
        return buf.getvalue()

    # Create dataset with metadata, vectors, and image blobs
    data = [
        {
            "id": 1,
            "filename": "red_square.png",
            "vector": np.random.rand(128).astype(np.float32),
            "image_blob": create_dummy_image('red'),
            "label": "red"
        },
        {
            "id": 2,
            "filename": "blue_square.png",
            "vector": np.random.rand(128).astype(np.float32),
            "image_blob": create_dummy_image('blue'),
            "label": "blue"
        }
    ]
    # --8<-- [end:create_dummy_data]

    # --8<-- [start:define_schema]
    # Define schema explicitly to ensure image_blob is treated as binary
    schema = pa.schema([
        pa.field("id", pa.int32()),
        pa.field("filename", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), 128)),
        pa.field("image_blob", pa.binary()),  # Important: Use pa.binary() for blobs
        pa.field("label", pa.string())
    ])
    # --8<-- [end:define_schema]

    # db_path_factory is a project fixture yielding a fresh database path.
    db_uri = db_path_factory("multimodal_db")
    db = lancedb.connect(db_uri)

    # --8<-- [start:ingest_data]
    tbl = db.create_table("images", data=data, schema=schema, mode="overwrite")
    # --8<-- [end:ingest_data]

    assert len(tbl) == 2

    # --8<-- [start:search_data]
    # Search for similar images
    query_vector = np.random.rand(128).astype(np.float32)
    results = tbl.search(query_vector).limit(1).to_pandas()
    # --8<-- [end:search_data]

    # --8<-- [start:process_results]
    # Convert back to PIL Image
    for _, row in results.iterrows():
        image_bytes = row['image_blob']
        image = Image.open(io.BytesIO(image_bytes))
        print(f"Retrieved image: {row['filename']}, Size: {image.size}")
        # You can now use 'image' with other libraries or display it
    # --8<-- [end:process_results]

    assert len(results) == 1

def test_blob_api_definition(db_path_factory):
    """Docs example for the Lance Blob API: define a ``pa.large_binary()``
    column tagged with ``lance-encoding:blob`` metadata and verify a table
    can be created against that schema.

    The code between the ``--8<--`` markers is extracted verbatim into the
    documentation snippets; keep it tutorial-friendly.
    """
    # --8<-- [start:blob_api_schema]
    import pyarrow as pa

    # Define schema with Blob API metadata for lazy loading
    schema = pa.schema([
        pa.field("id", pa.int64()),
        pa.field(
            "video",
            pa.large_binary(),
            metadata={"lance-encoding:blob": "true"}  # Enable Blob API
        ),
    ])
    # --8<-- [end:blob_api_schema]

    # --8<-- [start:blob_api_ingest]
    import lancedb
    import lance
    # NOTE(review): `lance` appears unused in this snippet — confirm it is
    # needed for the docs example before removing.

    db = lancedb.connect(db_path_factory("blob_db"))

    # Create sample data
    data = [
        {"id": 1, "video": b"fake_video_bytes_1"},
        {"id": 2, "video": b"fake_video_bytes_2"}
    ]

    # Create the table
    tbl = db.create_table("videos", data=data, schema=schema)
    # --8<-- [end:blob_api_ingest]
    assert len(tbl) == 2
Loading
Loading