Initial commit: OpenClaw RAG Knowledge System
- Full RAG system for OpenClaw agents
- Semantic search across chat history, code, docs, skills
- ChromaDB integration (all-MiniLM-L6-v2 embeddings)
- Automatic AI context retrieval
- Ingest pipelines for sessions, workspace, skills
- Python API and CLI interfaces
- Document management (add, delete, stats, reset)
This commit is contained in:
289
rag_system.py
Normal file
289
rag_system.py
Normal file
@@ -0,0 +1,289 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
OpenClaw RAG System - Core Library
|
||||
Manages vector store, ingestion, and retrieval with ChromaDB
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
CHROMADB_AVAILABLE = True
|
||||
except ImportError:
|
||||
CHROMADB_AVAILABLE = False
|
||||
|
||||
|
||||
class RAGSystem:
    """OpenClaw RAG System for knowledge retrieval.

    Wraps a persistent ChromaDB collection and provides document
    ingestion (single and batched), semantic search, deletion, and
    collection statistics.
    """

    def __init__(
        self,
        persist_directory: Optional[str] = None,
        collection_name: str = "openclaw_knowledge",
        embedding_model: str = "all-MiniLM-L6-v2"
    ):
        """
        Initialize RAG system

        Args:
            persist_directory: Where ChromaDB stores data
                (defaults to ~/.openclaw/data/rag)
            collection_name: Name of the collection
            embedding_model: Embedding model name (ChromaDB handles this;
                currently unused here, kept for interface compatibility)

        Raises:
            ImportError: If the chromadb package is not installed.
        """
        if not CHROMADB_AVAILABLE:
            raise ImportError("chromadb not installed. Run: pip3 install chromadb")

        self.collection_name = collection_name

        # Default to ~/.openclaw/data/rag if not specified
        if persist_directory is None:
            persist_directory = os.path.expanduser("~/.openclaw/data/rag")

        self.persist_directory = Path(persist_directory)
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        # Initialize ChromaDB client (on-disk persistence, telemetry off)
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={
                "created": datetime.now().isoformat(),
                "description": "OpenClaw knowledge base"
            }
        )

    def add_document(
        self,
        text: str,
        metadata: Dict,
        doc_id: Optional[str] = None
    ) -> str:
        """
        Add a document to the vector store

        Args:
            text: Document content
            metadata: Document metadata (type, source, date, etc.)
            doc_id: Optional document ID (auto-generated if not provided)

        Returns:
            Document ID
        """
        # Generate ID if not provided (include more context for uniqueness).
        # NOTE: when 'date' is absent the ID includes datetime.now(), so
        # re-adding the same text later yields a new ID (no dedup).
        if doc_id is None:
            unique_str = ":".join([
                metadata.get('type', 'unknown'),
                metadata.get('source', 'unknown'),
                metadata.get('date', datetime.now().isoformat()),
                str(metadata.get('chunk_index', '0')),  # Convert to string!
                text[:200]
            ])
            doc_id = hashlib.md5(unique_str.encode()).hexdigest()

        # Add to collection
        self.collection.add(
            documents=[text],
            metadatas=[metadata],
            ids=[doc_id]
        )

        return doc_id

    def add_documents_batch(
        self,
        documents: List[Dict],
        batch_size: int = 100
    ) -> List[str]:
        """
        Add multiple documents efficiently

        Args:
            documents: List of {"text": str, "metadata": dict, "id": optional} dicts
            batch_size: Number of documents to add per batch

        Returns:
            List of document IDs
        """
        all_ids = []

        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]

            texts = [doc["text"] for doc in batch]
            metadatas = [doc["metadata"] for doc in batch]
            # NOTE(review): this hash recipe mirrors add_document()'s but uses
            # text[:100] and an empty default date, so the two paths generate
            # different IDs for the same document — confirm this is intended.
            ids = [doc.get("id", hashlib.md5(
                f"{doc['metadata'].get('type', 'unknown')}:{doc['metadata'].get('source', 'unknown')}:{doc['metadata'].get('date', '')}:{str(doc['metadata'].get('chunk_index', '0'))}:{doc['text'][:100]}".encode()
            ).hexdigest()) for doc in batch]

            self.collection.add(
                documents=texts,
                metadatas=metadatas,
                ids=ids
            )

            all_ids.extend(ids)
            print(f"✅ Added batch {i//batch_size + 1}: {len(ids)} documents")

        return all_ids

    def search(
        self,
        query: str,
        n_results: int = 10,
        filters: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Search for relevant documents

        Args:
            query: Search query
            n_results: Number of results to return
            filters: Optional metadata filters

        Returns:
            List of {"text": str, "metadata": dict, "id": str, "score": float} dicts
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            where=filters
        )

        # ChromaDB's query() includes distances by default (smaller = closer);
        # convert them to a similarity-style score in (0, 1]. Fall back to a
        # rank-based approximation if distances are missing.
        ids = results['ids'][0]
        distances = (results.get('distances') or [None])[0]

        formatted = []
        for i, doc_id in enumerate(ids):
            if distances is not None:
                score = 1.0 / (1.0 + distances[i])
            else:
                score = 1.0 - (i / len(ids))  # Simple approximation
            formatted.append({
                "id": doc_id,
                "text": results['documents'][0][i],
                "metadata": results['metadatas'][0][i],
                "score": score
            })

        return formatted

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID; return True on success, False on error."""
        try:
            self.collection.delete(ids=[doc_id])
            return True
        except Exception as e:
            print(f"❌ Error deleting document {doc_id}: {e}")
            return False

    def delete_by_filter(self, filter_dict: Dict) -> int:
        """
        Delete documents by metadata filter

        Args:
            filter_dict: Filter criteria (e.g., {"source": "session-2026-02-10"})

        Returns:
            Number of documents deleted
        """
        # First, find matching IDs
        results = self.collection.get(where=filter_dict)

        if not results['ids']:
            return 0

        count = len(results['ids'])
        self.collection.delete(ids=results['ids'])

        print(f"✅ Deleted {count} documents matching filter")
        return count

    def get_stats(self) -> Dict:
        """Get statistics about the collection.

        Note: source/type distributions are computed from a 10-document
        sample, not the full collection, so they are indicative only.
        """
        count = self.collection.count()

        # Get sample to understand metadata structure
        sample = self.collection.get(limit=10)

        # Count by source/type within the sample
        source_counts = {}
        type_counts = {}

        for metadata in sample['metadatas']:
            source = metadata.get('source', 'unknown')
            doc_type = metadata.get('type', 'unknown')

            source_counts[source] = source_counts.get(source, 0) + 1
            type_counts[doc_type] = type_counts.get(doc_type, 0) + 1

        return {
            "total_documents": count,
            "collection_name": self.collection_name,
            "persist_directory": str(self.persist_directory),
            "source_distribution": source_counts,
            "type_distribution": type_counts
        }

    def reset_collection(self):
        """Delete all documents and reset the collection"""
        # BUG FIX: collection.delete(where={}) is rejected by ChromaDB
        # (an empty where filter is invalid), so fetch every stored ID
        # and delete explicitly instead.
        existing = self.collection.get()
        if existing['ids']:
            self.collection.delete(ids=existing['ids'])
        print("✅ Collection reset - all documents deleted")

    def close(self):
        """Close the connection"""
        # ChromaDB PersistentClient doesn't need explicit close
        pass
|
||||
|
||||
|
||||
def main():
    """Smoke-test the RAG system: init, ingest one doc, search, show stats."""
    print("🚀 Testing OpenClaw RAG System...\n")

    # Spin up a system against the default storage location
    rag = RAGSystem()
    print("✅ Initialized RAG system")
    print(f" Collection: {rag.collection_name}")
    print(f" Storage: {rag.persist_directory}\n")

    # Ingest a single known document so the search below has a hit
    sample_text = (
        "OpenClaw is a personal AI assistant with tools for automation, "
        "messaging, and infrastructure management. It supports Discord, "
        "Telegram, SMS via VoIP.ms, and more."
    )
    sample_meta = {
        "type": "test",
        "source": "test-initialization",
        "date": datetime.now().isoformat()
    }

    new_id = rag.add_document(sample_text, sample_meta)
    print(f"✅ Added test document: {new_id}\n")

    # Query for content related to what we just ingested
    hits = rag.search(
        query="What messaging platforms does OpenClaw support?",
        n_results=5
    )

    print("🔍 Search Results:")
    for rank, hit in enumerate(hits, 1):
        print(f"\n{rank}. [{hit['metadata'].get('source', '?')}]")
        print(f" {hit['text'][:200]}...")

    # Summarize collection state
    stats = rag.get_stats()
    print("\n📊 Stats:")
    print(f" Total documents: {stats['total_documents']}")
|
||||
Reference in New Issue
Block a user