Add automatic daily RAG updates + cron job integration
This commit is contained in:
39
README.md
39
README.md
@@ -223,14 +223,45 @@ If you see "Expected IDs to be unique" errors:
|
|||||||
|
|
||||||
On first run, ChromaDB downloads the embedding model (~79MB). This takes 1-2 minutes. Let it complete.
|
On first run, ChromaDB downloads the embedding model (~79MB). This takes 1-2 minutes. Let it complete.
|
||||||
|
|
||||||
|
## Automatic Updates
|
||||||
|
|
||||||
|
### Setup Scheduled Indexing
|
||||||
|
|
||||||
|
The RAG system includes an automatic update script that is scheduled to run daily. To run it manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Manual test
|
||||||
|
bash /home/william/.openclaw/workspace/scripts/rag-auto-update.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**What it does:**
|
||||||
|
- Detects new/updated chat sessions and re-indexes them
|
||||||
|
- Re-indexes workspace files (captures code changes)
|
||||||
|
- Updates skill documentation
|
||||||
|
- Tracks the timestamp of the last indexed session so session re-indexing is skipped when no session file has changed
|
||||||
|
- Runs via cron at 4:00 AM UTC daily
|
||||||
|
|
||||||
|
**Configuration:**
|
||||||
|
```bash
|
||||||
|
# View cron job
|
||||||
|
openclaw cron list
|
||||||
|
|
||||||
|
# Edit schedule (if needed)
|
||||||
|
openclaw cron update <job-id> --schedule "{\"expr\":\"0 4 * * *\"}"
|
||||||
|
```
|
||||||
|
|
||||||
|
**State tracking:** `~/.openclaw/workspace/memory/rag-auto-state.json`
|
||||||
|
**Log file:** `~/.openclaw/workspace/memory/rag-auto-update.log`
|
||||||
|
|
||||||
## Best Practices
|
## Best Practices
|
||||||
|
|
||||||
### Re-index Regularly
|
### Automatic Update Enabled
|
||||||
|
|
||||||
After significant work, re-ingest to keep knowledge current:
|
The RAG system now automatically updates daily - no manual re-indexing needed.
|
||||||
|
|
||||||
|
After significant work, you can still trigger an update manually:
|
||||||
```bash
|
```bash
|
||||||
python3 ingest_sessions.py
|
bash /home/william/.openclaw/workspace/scripts/rag-auto-update.sh
|
||||||
python3 ingest_docs.py workspace
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Use Specific Queries
|
### Use Specific Queries
|
||||||
|
|||||||
128
scripts/rag-auto-update.sh
Executable file
128
scripts/rag-auto-update.sh
Executable file
@@ -0,0 +1,128 @@
|
|||||||
|
#!/bin/bash
# RAG Automatic Ingestion - Daily update of knowledge base
#
# Checks for new content and updates the RAG system. This preamble sets the
# shared paths and the run timestamp, makes sure the memory directory exists,
# and seeds the state file on first run.

set -e

# All paths hang off $HOME so they agree with the ~/.openclaw/... paths used
# by the rest of this script; abort early if HOME is not available.
: "${HOME:?HOME must be set}"
WORKSPACE_DIR="$HOME/.openclaw/workspace"
RAG_DIR="$WORKSPACE_DIR/rag"
STATE_FILE="$WORKSPACE_DIR/memory/rag-auto-state.json"
LOG_FILE="$WORKSPACE_DIR/memory/rag-auto-update.log"

# Run timestamp (UTC, ISO-8601); stored in the state file as "updatedAt"
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Create memory directory (holds both the state file and the log file)
mkdir -p "$(dirname "$STATE_FILE")"

# Initialize state if needed
if [ ! -f "$STATE_FILE" ]; then
  # Unquoted delimiter on purpose: $TIMESTAMP must be expanded.
  cat > "$STATE_FILE" << EOF
{
  "lastSessionIndex": 0,
  "lastWorkspaceIndex": 0,
  "lastSkillsIndex": 0,
  "updatedAt": "$TIMESTAMP"
}
EOF
fi
|
||||||
|
|
||||||
|
#######################################
# Write one timestamped line to stdout and append it to the log file.
# The timestamp is taken at call time (UTC) so the log shows when each
# phase of the run actually happened.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[<UTC timestamp>] <message>" to stdout and to LOG_FILE
#######################################
log() {
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '[%s] %s\n' "$now" "$1" | tee -a "$LOG_FILE"
}
|
||||||
|
|
||||||
|
#######################################
# Print the newest modification time (whole epoch seconds) among the
# *.jsonl session files, or 0 when there are none or the directory is
# missing.
# Outputs: one integer on stdout
#######################################
latest_session_time() {
  local newest
  # The pipeline's exit status is that of `cut`, which succeeds on empty
  # input, so "no files" has to be detected from the empty result rather
  # than from the exit code.
  newest=$(find ~/.openclaw/agents/main/sessions -name "*.jsonl" -type f -printf '%T@\n' 2>/dev/null \
    | sort -rn | head -1 | cut -d. -f1) || true
  printf '%s\n' "${newest:-0}"
}
|
||||||
|
|
||||||
|
log "=== RAG Auto-Update Started ==="

# Get current stats
SESSION_COUNT=$(find ~/.openclaw/agents/main/sessions -name "*.jsonl" | wc -l)
WORKSPACE_COUNT=$(find ~/.openclaw/workspace -type f \( -name "*.py" -o -name "*.js" -o -name "*.md" -o -name "*.json" \) | wc -l)
LATEST_SESSION=$(latest_session_time)

# Read last indexed timestamp. The state path is passed as an argument
# instead of being pasted into the Python source, so unusual characters in
# the path cannot break the snippet. Any failure falls back to 0.
LAST_SESSION_INDEX=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("lastSessionIndex", 0))' "$STATE_FILE" 2>/dev/null || echo "0")

# Both values feed an integer comparison later in the script; anything that
# is not a plain non-negative integer is treated as 0 ("never indexed").
case "$LATEST_SESSION" in
  '' | *[!0-9]*) LATEST_SESSION=0 ;;
esac
case "$LAST_SESSION_INDEX" in
  '' | *[!0-9]*) LAST_SESSION_INDEX=0 ;;
esac

log "Current status:"
log " Sessions: $SESSION_COUNT files"
log " Workspace: $WORKSPACE_COUNT searchable files"
log " Latest session: $LATEST_SESSION"
log " Last indexed: $LAST_SESSION_INDEX"
|
||||||
|
|
||||||
|
# Update sessions if new ones exist (newest session mtime is later than the
# timestamp recorded at the last successful run)
if [ "${LATEST_SESSION:-0}" -gt "${LAST_SESSION_INDEX:-0}" ]; then
  log "✓ New/updated sessions detected, re-indexing..."

  cd "$RAG_DIR" || { log "❌ Cannot enter $RAG_DIR"; exit 1; }

  # The ingest command is the `if` condition itself: with `set -e` active, a
  # bare failing command would end the script before the failure is logged.
  if python3 ingest_sessions.py --sessions-dir ~/.openclaw/agents/main/sessions >> "$LOG_FILE" 2>&1; then
    log "✅ Sessions re-indexed successfully"
  else
    log "❌ Session indexing failed (see log)"
    exit 1
  fi
else
  log "✓ Sessions up to date (no new files)"
fi
|
||||||
|
|
||||||
|
# Update workspace (always do it - captures code changes)
log "Re-indexing workspace files..."
cd "$RAG_DIR" || { log "❌ Cannot enter $RAG_DIR"; exit 1; }

# The ingest command is the `if` condition itself: with `set -e` active, a
# bare failing command would end the script before the failure is logged.
if python3 ingest_docs.py workspace >> "$LOG_FILE" 2>&1; then
  log "✅ Workspace re-indexed successfully"
else
  log "❌ Workspace indexing failed (see log)"
  exit 1
fi
|
||||||
|
|
||||||
|
# Update skills
log "Re-indexing skills..."
cd "$RAG_DIR" || { log "❌ Cannot enter $RAG_DIR"; exit 1; }

# The ingest command is the `if` condition itself: with `set -e` active, a
# bare failing command would end the script before the failure is logged.
if python3 ingest_docs.py skills >> "$LOG_FILE" 2>&1; then
  log "✅ Skills re-indexed successfully"
else
  log "❌ Skills indexing failed (see log)"
  exit 1
fi
|
||||||
|
|
||||||
|
# Get document count. Falls back to the literal string "unknown" when the
# collection cannot be queried (import error, missing directory, ...).
DOC_COUNT=$(cd "$RAG_DIR" && python3 -c "
import sys
sys.path.insert(0, '.')
from rag_system import get_collection
collection = get_collection()
print(collection.count())
" 2>/dev/null || echo "unknown")

#######################################
# Persist the results of this run to the state file as JSON.
# Values are handed to Python as arguments rather than pasted into its
# source, so a non-numeric value (e.g. DOC_COUNT="unknown") is stored as
# null instead of crashing the writer.
# Globals:   STATE_FILE, LATEST_SESSION, TIMESTAMP, DOC_COUNT,
#            SESSION_COUNT (all read)
# Returns:   python3's exit status (non-zero if the file cannot be written)
#######################################
write_state() {
  python3 - "$STATE_FILE" "$LATEST_SESSION" "$TIMESTAMP" "$DOC_COUNT" "$SESSION_COUNT" << 'PYEOF'
import json
import sys
import time

state_path, latest_session, updated_at, doc_count, session_count = sys.argv[1:6]


def to_int(text, default=None):
    """Return text as an int, or default when it is not an integer."""
    try:
        return int(text)
    except ValueError:
        return default


now = int(time.time())
state = {
    "lastSessionIndex": to_int(latest_session, 0),
    "lastWorkspaceIndex": now,
    "lastSkillsIndex": now,
    "updatedAt": updated_at,
    "totalDocuments": to_int(doc_count),
    "sessionCount": to_int(session_count),
}

with open(state_path, "w") as f:
    json.dump(state, f, indent=2)
PYEOF
}

# Update state
write_state
|
||||||
|
|
||||||
|
# Closing summary: completion banner, document total, and a next-run
# estimate (24 hours from now, via GNU `date -d`).
for summary_line in \
  "=== RAG Auto-Update Complete ===" \
  "Total documents in knowledge base: $DOC_COUNT" \
  "Next run: $(date -u -d '+24 hours' +%Y-%m-%dT%H:%M:%SZ)"; do
  log "$summary_line"
done

exit 0
|
||||||
Reference in New Issue
Block a user