openclaw-rag-skill/ingest_docs.py
Nova AI b272748209 Initial commit: OpenClaw RAG Knowledge System
- Full RAG system for OpenClaw agents
- Semantic search across chat history, code, docs, skills
- ChromaDB integration (all-MiniLM-L6-v2 embeddings)
- Automatic AI context retrieval
- Ingest pipelines for sessions, workspace, skills
- Python API and CLI interfaces
- Document management (add, delete, stats, reset)
2026-02-11 03:47:38 +00:00

265 lines
7.6 KiB
Python
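The commit above describes both ingestion and retrieval. As a quick orientation before the source, here is a minimal sketch of the Python API as this ingestor uses it; `RAGSystem(collection_name=...)`, `add_document(text, metadata)`, and `get_stats()` all appear in the code below, while the retrieval call is an assumption about `rag_system.py`, which is not shown here.

# Minimal sketch - only add_document() and get_stats() are confirmed by this file.
from rag_system import RAGSystem

rag = RAGSystem(collection_name="openclaw_knowledge")        # ChromaDB-backed store (all-MiniLM-L6-v2)
rag.add_document("OpenClaw agents can search their own docs.",
                 {"type": "workspace", "source": "example.md"})
print(rag.get_stats()["total_documents"])                    # count of indexed chunks
# results = rag.query("how are skills ingested?")            # hypothetical retrieval call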

#!/usr/bin/env python3
"""
RAG Document Ingestor - Load workspace files, skills, and docs into the vector store
"""
import os
import sys
from pathlib import Path
from datetime import datetime
from typing import List, Optional

# Add this script's directory to sys.path so rag_system can be imported
sys.path.insert(0, str(Path(__file__).parent))

from rag_system import RAGSystem


def read_file_safe(file_path: Path) -> Optional[str]:
    """Read a file, returning None if it cannot be read."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f" ⚠️ Error reading {file_path}: {e}")
        return None


def is_text_file(file_path: Path, max_size_mb: float = 1.0) -> bool:
    """Check that a file is not a known binary type and not too large."""
    # Skip binary files
    if file_path.suffix.lower() in ['.pyc', '.so', '.o', '.a', '.png', '.jpg', '.jpeg', '.gif', '.zip', '.tar', '.gz']:
        return False
    # Check size
    try:
        size_mb = file_path.stat().st_size / (1024 * 1024)
        if size_mb > max_size_mb:
            return False
    except OSError:
        return False
    return True


def chunk_text(text: str, max_chars: int = 4000, overlap: int = 200) -> List[str]:
    """Split text into chunks of at most roughly max_chars characters.

    NOTE: the overlap parameter is accepted but not currently used.
    """
    chunks = []
    if len(text) <= max_chars:
        return [text]
    # Simple splitting on paragraph (blank-line) boundaries for now
    # (could be improved to split at sentences and honor overlap)
    paragraphs = text.split('\n\n')
    current_chunk = ""
    for para in paragraphs:
        if len(current_chunk) + len(para) + 2 <= max_chars:
            current_chunk += para + "\n\n"
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para + "\n\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
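
# Illustrative only (figures assumed, not from the original docs): a
# 10,000-character file made of ~1,500-character paragraphs yields three
# or four chunks of roughly 3,000 characters each; a single paragraph
# longer than max_chars is still emitted as one oversized chunk.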


def ingest_workspace(
    workspace_dir: Optional[str] = None,
    collection_name: str = "openclaw_knowledge",
    file_patterns: Optional[List[str]] = None
):
    """
    Ingest workspace files into the RAG system

    Args:
        workspace_dir: Path to the workspace directory
        collection_name: Name of the ChromaDB collection
        file_patterns: File patterns to include (default: common text and code types)
    """
    if workspace_dir is None:
        workspace_dir = os.path.expanduser("~/.openclaw/workspace")
    workspace_path = Path(workspace_dir)
    if not workspace_path.exists():
        print(f"❌ Workspace not found: {workspace_dir}")
        return
    print(f"🔍 Scanning workspace: {workspace_path}")

    # Default file patterns
    if file_patterns is None:
        file_patterns = [
            "*.md", "*.py", "*.js", "*.ts", "*.json", "*.yaml", "*.yml",
            "*.txt", "*.sh", "*.html", "*.css"
        ]

    # Find all matching files
    all_files = []
    for pattern in file_patterns:
        for file_path in workspace_path.rglob(pattern):
            if is_text_file(file_path):
                all_files.append(file_path)
    if not all_files:
        print("⚠️ No files found")
        return
    print(f"✅ Found {len(all_files)} files\n")

    # Initialize RAG
    rag = RAGSystem(collection_name=collection_name)
    total_chunks = 0
    files_to_process = all_files[:100]  # Limit to 100 files for testing
    for file_path in files_to_process:
        relative_path = file_path.relative_to(workspace_path)
        print(f"\n📄 {relative_path}")

        # Read file
        content = read_file_safe(file_path)
        if content is None:
            continue

        # Chunk if too large
        if len(content) > 4000:
            text_chunks = chunk_text(content)
            print(f" Chunks: {len(text_chunks)}")
        else:
            text_chunks = [content]
            print(f" Size: {len(content)} chars")

        # Add each chunk
        for i, chunk in enumerate(text_chunks):
            metadata = {
                "type": "workspace",
                "source": str(relative_path),
                "file_path": str(file_path),
                "file_size": len(content),
                "chunk_index": i,
                "total_chunks": len(text_chunks),
                "file_extension": file_path.suffix.lower(),
                "ingested_at": datetime.now().isoformat()
            }
            rag.add_document(chunk, metadata)
            total_chunks += 1
        print(f" ✅ Indexed {len(text_chunks)} chunk(s)")

    print("\n📊 Summary:")
    print(f" Files processed: {len(files_to_process)}")
    print(f" Total chunks indexed: {total_chunks}")
    stats = rag.get_stats()
    print(f" Total documents in collection: {stats['total_documents']}")


def ingest_skills(
    skills_base_dir: Optional[str] = None,
    collection_name: str = "openclaw_knowledge"
):
    """
    Ingest all SKILL.md files from the skills directories

    Args:
        skills_base_dir: Base directory for skills (default: system and workspace skills)
        collection_name: Name of the ChromaDB collection
    """
    # Default to the OpenClaw skills directories
    if skills_base_dir is None:
        # Check both system and workspace skills
        system_skills = Path("/usr/lib/node_modules/openclaw/skills")
        workspace_skills = Path(os.path.expanduser("~/.openclaw/workspace/skills"))
        skills_dirs = [d for d in [system_skills, workspace_skills] if d.exists()]
        if not skills_dirs:
            print("❌ No skills directories found")
            return
    else:
        skills_dirs = [Path(skills_base_dir)]
    print("🔍 Scanning for skills...")

    # Find all SKILL.md files
    skill_files = []
    for skills_dir in skills_dirs:
        for skill_file in skills_dir.rglob("SKILL.md"):
            skill_files.append(skill_file)
    if not skill_files:
        print("⚠️ No SKILL.md files found")
        return
    print(f"✅ Found {len(skill_files)} skills\n")

    # Initialize RAG
    rag = RAGSystem(collection_name=collection_name)
    total_chunks = 0
    for skill_file in skill_files:
        # Determine the skill name from the path
        if skill_file.name == "SKILL.md":
            skill_name = skill_file.parent.name
        else:
            skill_name = skill_file.stem
        print(f"\n📜 {skill_name}")
        content = read_file_safe(skill_file)
        if content is None:
            continue

        # Chunk skill documentation
        chunks = chunk_text(content, max_chars=3000, overlap=100)
        for i, chunk in enumerate(chunks):
            metadata = {
                "type": "skill",
                "source": f"skill:{skill_name}",
                "skill_name": skill_name,
                "file_path": str(skill_file),
                "chunk_index": i,
                "total_chunks": len(chunks),
                "ingested_at": datetime.now().isoformat()
            }
            rag.add_document(chunk, metadata)
            total_chunks += 1
        print(f" ✅ Indexed {len(chunks)} chunk(s)")

    print("\n📊 Summary:")
    print(f" Skills processed: {len(skill_files)}")
    print(f" Total chunks indexed: {total_chunks}")
    stats = rag.get_stats()
    print(f" Total documents in collection: {stats['total_documents']}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Ingest files into OpenClaw RAG")
    parser.add_argument("type", choices=["workspace", "skills"], help="What to ingest")
    parser.add_argument("--path", help="Path to workspace or skills directory")
    parser.add_argument("--collection", default="openclaw_knowledge", help="Collection name")
    args = parser.parse_args()

    if args.type == "workspace":
        ingest_workspace(workspace_dir=args.path, collection_name=args.collection)
    elif args.type == "skills":
        ingest_skills(skills_base_dir=args.path, collection_name=args.collection)
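
Based on the argparse definitions above, the script is invoked as follows; when --path is omitted, the functions fall back to ~/.openclaw/workspace and the default skills directories.

# Index workspace files into the default collection
python3 ingest_docs.py workspace

# Index SKILL.md files from a specific directory into a named collection
python3 ingest_docs.py skills --path ~/.openclaw/workspace/skills --collection openclaw_knowledge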