Skip to main content

Build vector pipelines

The vectorstore package provides document processing, embedding, and vector storage. Install it with: pip install rakam-systems-vectorstore[all] (the core package is required).

Core data structures

from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile

# VSFile - Represents a document source
vsfile = VSFile(file_path="/path/to/document.pdf")
print(vsfile.uuid, vsfile.file_name, vsfile.mime_type)

# NodeMetadata - Metadata for document chunks
metadata = NodeMetadata(
source_file_uuid=str(vsfile.uuid),
position=0, # Page number or chunk position
custom={"author": "John", "date": "2024-01-01"}
)

# Node - A chunk with content and metadata
node = Node(content="Document content here...", metadata=metadata)
node.embedding = [0.1, 0.2, 0.3, ...] # Set after embedding

Embeddings

A multi-backend embedding model with a unified interface:

from rakam_systems_vectorstore import ConfigurableEmbeddings, create_embedding_model

# Using Sentence Transformers (local)
embeddings = ConfigurableEmbeddings(config={
"model_type": "sentence_transformer",
"model_name": "Snowflake/snowflake-arctic-embed-m",
"batch_size": 128,
"normalize": True
})

# Using OpenAI (with batch processing)
embeddings = ConfigurableEmbeddings(config={
"model_type": "openai",
"model_name": "text-embedding-3-small",
"api_key": "...", # Or use OPENAI_API_KEY
"batch_size": 100
})

# Using Cohere
embeddings = ConfigurableEmbeddings(config={
"model_type": "cohere",
"model_name": "embed-english-v3.0",
"api_key": "..." # Or use COHERE_API_KEY
})

# Using HuggingFace models with authentication
embeddings = ConfigurableEmbeddings(config={
"model_type": "sentence_transformer",
"model_name": "private/model-name",
# Uses HUGGINGFACE_TOKEN environment variable
})

embeddings.setup()

# Encode texts with automatic batch processing
vectors = embeddings.run(["Hello world", "How are you?"])

# Encode queries (optimized for single texts)
query_vector = embeddings.encode_query("What is AI?")

# Encode documents (optimized for batches)
doc_vectors = embeddings.encode_documents(documents)

# Get dimension
dim = embeddings.embedding_dimension

Performance features: automatic batch processing with progress tracking, memory optimization with garbage collection, token truncation for oversized texts, and CUDA memory management for GPU acceleration.

Factory function

embeddings = create_embedding_model(
model_type="sentence_transformer",
model_name="all-MiniLM-L6-v2",
batch_size=64
)

Document loading

AdaptiveLoader

Automatically detects and processes various file types:

from rakam_systems_vectorstore import AdaptiveLoader, create_adaptive_loader

loader = AdaptiveLoader(config={
"encoding": "utf-8",
"chunk_size": 512,
"chunk_overlap": 50
})

# Supported file types:
# - Text: .txt, .text
# - Markdown: .md, .markdown
# - Documents: .pdf, .docx, .doc, .odt
# - Email: .eml, .msg
# - Data: .json, .csv, .tsv, .xlsx, .xls
# - HTML: .html, .htm, .xhtml
# - Code: .py, .js, .ts, .java, .cpp, .go, .rs, .rb, etc.

# Load as single text
text = loader.load_as_text("document.pdf")

# Load as chunks
chunks = loader.load_as_chunks("document.pdf")

# Load as nodes (with metadata)
nodes = loader.load_as_nodes("document.pdf", custom_metadata={"category": "science"})

# Load as VSFile
vsfile = loader.load_as_vsfile("document.pdf")

# Also handles raw text
chunks = loader.load_as_chunks("This is raw text content...")

Factory function:

loader = create_adaptive_loader(
chunk_size=1024,
chunk_overlap=100,
encoding='utf-8'
)

Specialized loaders

Located in rakam_systems_vectorstore/components/loader/:

| Loader | File types | Features |
| --- | --- | --- |
| PdfLoader | .pdf | Advanced PDF processing with Docling, image extraction, table detection |
| PdfLoaderLight | .pdf | Lightweight PDF processing with pymupdf4llm, markdown conversion, image extraction |
| DocLoader | .docx, .doc | Microsoft Word documents, image extraction |
| OdtLoader | .odt | OpenDocument Text, image extraction |
| MdLoader | .md | Markdown with structure preservation, YAML frontmatter |
| HtmlLoader | .html, .htm | HTML parsing and text extraction |
| EmlLoader | .eml, .msg | Email files (loaded as single nodes) |
| TabularLoader | .csv, .tsv, .xlsx | Tabular data processing, preserves column structure |
| CodeLoader | .py, .js, etc. | Code-aware chunking with syntax preservation |

PdfLoaderLight

A lightweight alternative to PdfLoader using pymupdf4llm:

from rakam_systems_vectorstore.components.loader import PdfLoaderLight

loader = PdfLoaderLight(
name="pdf_loader_light",
config={
"chunk_size": 512,
"chunk_overlap": 50,
"extract_images": True,
"image_path": "./extracted_images",
"page_chunks": True,
"write_images": True
}
)

markdown_text = loader.load_as_text("document.pdf")
chunks = loader.load_as_chunks("document.pdf")
nodes = loader.load_as_nodes("document.pdf")

# Access extracted images
image_paths = loader.get_image_paths()
for img_id, img_path in image_paths.items():
print(f"Image {img_id}: {img_path}")

Image extraction support

Multiple loaders support image extraction:

from rakam_systems_vectorstore.components.loader import DocLoader, OdtLoader, PdfLoaderLight

doc_loader = DocLoader(config={
"extract_images": True,
"image_path": "./doc_images"
})
nodes = doc_loader.load_as_nodes("document.docx")

for img_id, img_path in doc_loader.get_image_paths().items():
print(f"Image {img_id}: {img_path}")

Chunking

TextChunker

Sentence-based text chunking using Chonkie:

from rakam_systems_vectorstore.components.chunker import TextChunker, create_text_chunker

chunker = TextChunker(
chunk_size=512, # Tokens per chunk
chunk_overlap=50, # Overlap in tokens
min_sentences_per_chunk=1,
tokenizer="character" # Or "gpt2", HuggingFace tokenizer
)

chunks = chunker.chunk_text("Long document text...")
# Returns: [{"text": "...", "token_count": 100, "start_index": 0, "end_index": 500}, ...]

# Process multiple documents
all_chunks = chunker.run(["doc1 text", "doc2 text"])

AdvancedChunker

Context-aware chunking using Docling with heading preservation:

from rakam_systems_vectorstore.components.chunker import AdvancedChunker

chunker = AdvancedChunker(
name="advanced_chunker",
config={
"max_tokens": 512,
"merge_peers": True,
"min_chunk_tokens": 64,
"filter_toc": True,
"include_heading_markers": True
}
)

chunks = chunker.chunk_text("Document text with headings...")
# Each chunk includes: text, token_count, start_index, end_index, heading_context

Features: context-aware chunking with heading hierarchy, automatic merging of small chunks, table of contents filtering, image and table fragment handling, and support for markdown heading markers.