Initial commit: OpenClaw RAG Knowledge System

- Full RAG system for OpenClaw agents
- Semantic search across chat history, code, docs, skills
- ChromaDB integration (all-MiniLM-L6-v2 embeddings)
- Automatic AI context retrieval
- Ingest pipelines for sessions, workspace, skills
- Python API and CLI interfaces
- Document management (add, delete, stats, reset)
2026-02-11 03:47:38 +00:00
commit b272748209
11 changed files with 2362 additions and 0 deletions
--- a/rag_agent.py
+++ b/rag_agent.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+RAG-Enhanced OpenClaw Agent
+
+This agent automatically retrieves relevant context from the knowledge base
+before responding, providing conversation history, code, and documentation context.
+
+Usage:
+    python3 /path/to/rag_agent.py <user_message> [session_jsonl_file]
+
+Or integrate into OpenClaw as an agent wrapper.
+"""
+
+import sys
+import json
+from pathlib import Path
+
+# Add parent directory to import RAG system
+current_dir = Path(__file__).parent
+sys.path.insert(0, str(current_dir))
+
+from rag_system import RAGSystem
+
+
def extract_user_query(messages: list) -> str:
    """
    Extract the most recent user message from conversation history.

    The history is scanned from newest to oldest. The first entry whose role
    is 'user' and whose content is a string or a list decides the result;
    user entries with any other content type are skipped in favour of older
    ones.

    Args:
        messages: List of message objects (dicts with 'role' and 'content').
            Entries that are not dicts are ignored; None is treated as an
            empty history.

    Returns:
        User query string. For list-style content this is the 'text' of every
        item of type 'text', joined with single spaces. Returns '' when no
        usable user message is found.
    """
    for msg in reversed(messages or []):
        # Malformed history entries (None, bare strings, ...) must not abort
        # the scan; only dict-shaped user messages are considered.
        if not isinstance(msg, dict) or msg.get('role') != 'user':
            continue

        content = msg.get('content', '')

        # Plain string content is the query itself
        if isinstance(content, str):
            return content

        # Structured content: keep only the text parts
        if isinstance(content, list):
            return ' '.join(
                item.get('text') or ''
                for item in content
                if isinstance(item, dict) and item.get('type') == 'text'
            )

    return ''
+
+
def search_relevant_context(query: str, rag: "RAGSystem", max_results: int = 5) -> str:
    """
    Search the knowledge base for relevant context.

    Args:
        query: User's question. Queries with fewer than 3 non-whitespace-
            trimmed characters are not searched.
        rag: RAGSystem instance; only its search(query, n_results=...) method
            is used, and each result is read as a dict with optional 'text'
            and 'metadata' entries.
        max_results: Maximum results to return

    Returns:
        Formatted context string (a summary line followed by one headed
        section per result, each text capped at 800 characters), or '' when
        the query is too short, nothing is found, or the search fails.
    """
    # Too-short or whitespace-only queries are skipped entirely
    if not isinstance(query, str) or len(query.strip()) < 3:
        return ''

    try:
        results = rag.search(query, n_results=max_results)

        if not results:
            return ''

        context_parts = [f"Found {len(results)} relevant context items:\n"]

        for i, result in enumerate(results, 1):
            # 'metadata' may be absent or explicitly None (presumably for
            # documents stored without metadata - confirm against
            # RAGSystem.search); either way fall back to an empty mapping so
            # one such result does not discard all the others.
            metadata = result.get('metadata') or {}
            doc_type = metadata.get('type', 'unknown')
            source = metadata.get('source', 'unknown')

            # Header based on type
            if doc_type == 'session':
                header = f"[Session Reference {i}]"
            elif doc_type == 'workspace':
                header = f"[Code/Docs {i}: {source}]"
            elif doc_type == 'skill':
                header = f"[Skill Reference {i}: {source}]"
            else:
                header = f"[Reference {i}]"

            # Truncate long content so the prompt stays bounded
            text = result.get('text') or ''
            if len(text) > 800:
                text = text[:800] + "..."

            context_parts.append(f"{header}\n{text}\n")

        return '\n'.join(context_parts)

    except Exception:
        # RAG shouldn't break conversations: degrade to "no context", but
        # leave a trace for anyone debugging with logging enabled.
        import logging
        logging.getLogger(__name__).debug("RAG context search failed", exc_info=True)
        return ''
+
+
def enhance_message_with_rag(
    message_content: str,
    conversation_history: list,
    collection_name: str = "openclaw_knowledge"
) -> str:
    """
    Enhance a user message with relevant RAG context.

    This is the main integration point. Call this before sending messages to the LLM.

    The knowledge base is queried with the current message. Only when the
    current message carries no text is the most recent user message from
    conversation_history used as the query instead.

    Args:
        message_content: The current user message
        conversation_history: Recent conversation messages (may be empty or None)
        collection_name: ChromaDB collection name

    Returns:
        Enhanced message string with RAG context prepended, or the original
        message_content unchanged when no context is found or RAG fails
    """
    try:
        # Initialize RAG system
        rag = RAGSystem(collection_name=collection_name)

        # The current message drives retrieval. It is evaluated on its own
        # rather than merged into the history, because the history scan runs
        # newest-first and an older user message would otherwise win.
        user_query = extract_user_query([{'role': 'user', 'content': message_content}])
        if not user_query.strip():
            user_query = extract_user_query(conversation_history or [])

        # Search for relevant context
        context = search_relevant_context(user_query, rag, max_results=5)

        if not context:
            return message_content

        # Prepend context to the message
        return f"""[RAG CONTEXT - Retrieved from knowledge base:]
{context}

---

[CURRENT USER MESSAGE:]
{message_content}"""

    except Exception:
        # RAG is best-effort: fall back to the untouched message, but leave a
        # trace for anyone debugging with logging enabled.
        import logging
        logging.getLogger(__name__).debug("RAG enhancement failed", exc_info=True)
        return message_content
+
+
def _load_session_history(session_jsonl: str) -> list:
    """Read message objects from a session JSONL file, skipping unusable lines."""
    history = []
    try:
        # errors='replace' keeps a stray undecodable byte from aborting the read
        with open(session_jsonl, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    # One corrupt line must not discard the rest of the session
                    continue
                if not isinstance(event, dict) or event.get('type') != 'message':
                    continue
                msg = event.get('message', {})
                if isinstance(msg, dict):
                    history.append(msg)
    except OSError:
        # Missing or unreadable file: history is optional, so carry on with
        # whatever was collected and leave a trace for debugging.
        import logging
        logging.getLogger(__name__).debug(
            "Could not read session file %s", session_jsonl, exc_info=True
        )
    return history


def get_response_with_rag(
    user_message: str,
    session_jsonl: str = None,
    collection_name: str = "openclaw_knowledge"
) -> str:
    """
    Get an AI response with automatic RAG-enhanced context.

    This is a helper function that can be called from scripts.

    Args:
        user_message: The user's question
        session_jsonl: Optional path to a session file (for conversation
            history). Each non-blank line is parsed as JSON; events whose
            'type' is 'message' contribute their 'message' object. A missing
            or unreadable file and malformed lines are ignored.
        collection_name: ChromaDB collection name

    Returns:
        Enhanced message ready for LLM processing
    """
    # Load conversation history if session file provided
    conversation_history = _load_session_history(session_jsonl) if session_jsonl else []

    # Enhance message
    return enhance_message_with_rag(user_message, conversation_history, collection_name)
+
+
if __name__ == "__main__":
    # Manual test harness: show what a message looks like after RAG enhancement.
    cli_args = sys.argv[1:]

    # The message is mandatory; without it, explain both ways to use the module.
    if not cli_args:
        for usage_line in (
            "Usage: python3 rag_agent.py <user_message> [session_jsonl]",
            "\nOr import and use:",
            "  from rag.rag_agent import enhance_message_with_rag",
            "  enhanced = enhance_message_with_rag(user_message, history)",
        ):
            print(usage_line)
        sys.exit(1)

    # First argument is the message, the optional second one a session file.
    user_message = cli_args[0]
    session_jsonl = cli_args[1] if len(cli_args) > 1 else None

    enhanced = get_response_with_rag(user_message, session_jsonl)

    # Frame the result between 80-column rules so it stands out in a terminal.
    banner = "=" * 80
    print("\n" + banner)
    print("ENHANCED MESSAGE (Ready for LLM):")
    print(banner)
    print(enhanced)
    print(banner + "\n")