From 0c2f98004ab1e5970a16248a5eb8c5f2831ba346 Mon Sep 17 00:00:00 2001 From: Nova AI Date: Wed, 11 Feb 2026 15:58:48 +0000 Subject: [PATCH] Add automatic daily RAG updates + cron job integration --- README.md | 39 +++++++++-- scripts/rag-auto-update.sh | 128 +++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 4 deletions(-) create mode 100755 scripts/rag-auto-update.sh diff --git a/README.md b/README.md index 2333ef6..4ab27d2 100644 --- a/README.md +++ b/README.md @@ -223,14 +223,45 @@ If you see "Expected IDs to be unique" errors: On first run, ChromaDB downloads the embedding model (~79MB). This takes 1-2 minutes. Let it complete. +## Automatic Updates + +### Set Up Scheduled Indexing + +The RAG system includes an automatic update script that runs daily: + +```bash +# Manual test +bash /home/william/.openclaw/workspace/scripts/rag-auto-update.sh +``` + +**What it does:** +- Detects new/updated chat sessions and re-indexes them +- Re-indexes workspace files (captures code changes) +- Updates skill documentation +- Tracks the last session index time so unchanged sessions are not re-processed +- Runs via cron at 4:00 AM UTC daily + +**Configuration:** +```bash +# View cron job +openclaw cron list + +# Edit schedule (if needed) +openclaw cron update --schedule "{\"expr\":\"0 4 * * *\"}" +``` + +- **State tracking:** `~/.openclaw/workspace/memory/rag-auto-state.json` +- **Log file:** `~/.openclaw/workspace/memory/rag-auto-update.log` + ## Best Practices -### Re-index Regularly +### Automatic Updates Enabled -After significant work, re-ingest to keep knowledge current: +The RAG system now automatically updates daily - no manual re-indexing needed. 
+ +After significant work, you can still manually update: ```bash -python3 ingest_sessions.py -python3 ingest_docs.py workspace +bash /home/william/.openclaw/workspace/scripts/rag-auto-update.sh ``` ### Use Specific Queries
diff --git a/scripts/rag-auto-update.sh b/scripts/rag-auto-update.sh
new file mode 100755
index 0000000..e332d4d
--- /dev/null
+++ b/scripts/rag-auto-update.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+# RAG Automatic Ingestion - daily update of the knowledge base.
+#
+# Re-indexes chat sessions when a session file is newer than the last recorded
+# index time, always re-indexes workspace files and skills, then records run
+# statistics in a JSON state file.
+#
+# Usage:    bash rag-auto-update.sh
+# Requires: bash, python3, GNU find (-printf) and GNU date (-d).
+# Exit:     0 on success; non-zero as soon as an indexing step fails.
+
+set -euo pipefail
+
+# Paths (all derived from $HOME so every location shares one base directory)
+readonly BASE_DIR="${HOME:?HOME must be set}/.openclaw"
+readonly WORKSPACE_DIR="${BASE_DIR}/workspace"
+readonly SESSIONS_DIR="${BASE_DIR}/agents/main/sessions"
+readonly RAG_DIR="${WORKSPACE_DIR}/rag"
+readonly STATE_FILE="${WORKSPACE_DIR}/memory/rag-auto-state.json"
+readonly LOG_FILE="${WORKSPACE_DIR}/memory/rag-auto-update.log"
+
+# Print the current UTC time as an ISO-8601 timestamp.
+utc_now() {
+  date -u +"%Y-%m-%dT%H:%M:%SZ"
+}
+
+# Echo a timestamped message to stdout and append it to the log file.
+log() {
+  printf '[%s] %s\n' "$(utc_now)" "$1" | tee -a "$LOG_FILE"
+}
+
+# Count regular files under directory $1 that match the remaining find tests.
+# find errors (e.g. a missing directory) are deliberately ignored: count is 0.
+count_files() {
+  local dir=$1
+  shift
+  { find "$dir" -type f "$@" -printf '.' 2>/dev/null || true; } | wc -c
+}
+
+# Print the newest session file mtime in whole epoch seconds, or 0 if none.
+latest_session_time() {
+  { find "$SESSIONS_DIR" -name "*.jsonl" -type f -printf '%T@\n' 2>/dev/null || true; } |
+    awk -F. '($1 + 0) > (best + 0) { best = $1 } END { print (best == "" ? 0 : best) }'
+}
+
+# Print the non-negative integer stored under key $1 in the state file.
+# Prints 0 when the file or key is missing or the value is not an integer.
+read_state_int() {
+  local key=$1 value
+  value=$(python3 -c \
+    'import json, sys; print(json.load(open(sys.argv[1])).get(sys.argv[2], 0))' \
+    "$STATE_FILE" "$key" 2>/dev/null) || value=0
+  [[ "$value" =~ ^[0-9]+$ ]] || value=0
+  printf '%s\n' "$value"
+}
+
+# Create the state file with zeroed indexes. $1 is the run timestamp.
+init_state() {
+  cat >"$STATE_FILE" <<EOF
+{
+  "lastSessionIndex": 0,
+  "lastWorkspaceIndex": 0,
+  "lastSkillsIndex": 0,
+  "updatedAt": "$1"
+}
+EOF
+}
+
+# Run an ingestion command from $RAG_DIR with its output appended to the log.
+# Arguments: $1 success message, $2 failure message, remaining: the command.
+run_ingest() {
+  local ok_msg=$1 fail_msg=$2
+  shift 2
+  # Testing the command inside `if` stops `set -e` from aborting the script
+  # before the failure has been logged.
+  if (cd "$RAG_DIR" && "$@") >>"$LOG_FILE" 2>&1; then
+    log "$ok_msg"
+  else
+    log "$fail_msg"
+    exit 1
+  fi
+}
+
+# Print the number of documents in the collection, or "unknown" on any error.
+document_count() {
+  local count
+  count=$(cd "$RAG_DIR" && python3 -c "
+import sys
+sys.path.insert(0, '.')
+from rag_system import get_collection
+print(get_collection().count())
+" 2>/dev/null) || count=""
+  if [[ "$count" =~ ^[0-9]+$ ]]; then
+    printf '%s\n' "$count"
+  else
+    printf 'unknown\n'
+  fi
+}
+
+# Atomically write the run statistics to the state file.
+# Arguments: $1 latest session mtime, $2 run timestamp,
+#            $3 document count or "unknown", $4 session count.
+write_state() {
+  python3 - "$STATE_FILE" "$@" <<'PY'
+import json
+import os
+import sys
+import time
+
+path, latest_session, updated_at, doc_count, session_count = sys.argv[1:6]
+now = int(time.time())
+state = {
+    "lastSessionIndex": int(latest_session),
+    "lastWorkspaceIndex": now,
+    "lastSkillsIndex": now,
+    "updatedAt": updated_at,
+    # An undetermined count is stored as null instead of breaking the write.
+    "totalDocuments": int(doc_count) if doc_count.isdigit() else None,
+    "sessionCount": int(session_count),
+}
+
+# Write a temporary file first so a crash cannot leave a truncated state file.
+tmp_path = path + ".tmp"
+with open(tmp_path, "w") as fh:
+    json.dump(state, fh, indent=2)
+os.replace(tmp_path, path)
+PY
+}
+
+main() {
+  local run_started session_count workspace_count
+  local latest_session last_session_index doc_count
+  run_started=$(utc_now)
+
+  mkdir -p "$(dirname "$STATE_FILE")"
+  [[ -f "$STATE_FILE" ]] || init_state "$run_started"
+
+  log "=== RAG Auto-Update Started ==="
+
+  session_count=$(count_files "$SESSIONS_DIR" -name "*.jsonl")
+  workspace_count=$(count_files "$WORKSPACE_DIR" \
+    \( -name "*.py" -o -name "*.js" -o -name "*.md" -o -name "*.json" \))
+  latest_session=$(latest_session_time)
+  last_session_index=$(read_state_int lastSessionIndex)
+
+  log "Current status:"
+  log "  Sessions: $session_count files"
+  log "  Workspace: $workspace_count searchable files"
+  log "  Latest session: $latest_session"
+  log "  Last indexed: $last_session_index"
+
+  # Sessions are re-indexed only when a file is newer than the recorded time.
+  if [ "$latest_session" -gt "$last_session_index" ]; then
+    log "✓ New/updated sessions detected, re-indexing..."
+    run_ingest "✅ Sessions re-indexed successfully" \
+      "❌ Session indexing failed (see log)" \
+      python3 ingest_sessions.py --sessions-dir "$SESSIONS_DIR"
+  else
+    log "✓ Sessions up to date (no new files)"
+  fi
+
+  # Workspace and skills are always re-indexed to capture content changes.
+  log "Re-indexing workspace files..."
+  run_ingest "✅ Workspace re-indexed successfully" \
+    "❌ Workspace indexing failed (see log)" \
+    python3 ingest_docs.py workspace
+
+  log "Re-indexing skills..."
+  run_ingest "✅ Skills re-indexed successfully" \
+    "❌ Skills indexing failed (see log)" \
+    python3 ingest_docs.py skills
+
+  doc_count=$(document_count)
+  write_state "$latest_session" "$run_started" "$doc_count" "$session_count"
+
+  log "=== RAG Auto-Update Complete ==="
+  log "Total documents in knowledge base: $doc_count"
+  log "Next run: $(date -u -d '+24 hours' +%Y-%m-%dT%H:%M:%SZ)"
+}
+
+main "$@"