feat: upgrade to async extractor, add RAG processing, link healing and Docker support

2026-05-07 18:43:43 -03:00
parent 68dc35abbd
commit 570292d8a9
116 changed files with 16277 additions and 388 deletions
@@ -0,0 +1,67 @@
+import os
+import glob
+import re
+import json
+
+def chunk_markdown(file_path, max_chars=2000):
+    """
+    Split a Markdown file into heading-based chunks.
+
+    Simple ``key: value`` front matter (delimited by ``---`` lines) is parsed
+    into a metadata dict. The body is cut at every ``##``-or-deeper heading
+    and each heading is kept glued to the text that follows it. Sections are
+    then packed greedily into chunks of at most ``max_chars`` characters,
+    each prefixed with the document title so it keeps context on its own.
+
+    Args:
+        file_path: Path to a UTF-8 encoded Markdown file.
+        max_chars: Soft size limit per chunk, title prefix included. A single
+            section longer than this is emitted whole rather than cut.
+
+    Returns:
+        A list of ``{"metadata": dict, "content": str}`` dicts in document
+        order. Every chunk gets its own copy of the metadata. A file without
+        any body text yields an empty list.
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Extract metadata (front matter).
+    metadata = {}
+    body = content
+    meta_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
+    if meta_match:
+        for line in meta_match.group(1).split("\n"):
+            if ":" in line:
+                # Split on the first colon only so values such as URLs survive.
+                key, val = line.split(":", 1)
+                metadata[key.strip()] = val.strip()
+        body = content[meta_match.end():]
+
+    # Split on "##", "###", ... heading lines. MULTILINE anchors also catch a
+    # heading on the very first or very last line of the body.
+    # NOTE: lines starting with "## " inside fenced code blocks are treated
+    # as headings too.
+    pieces = re.split(r"^(##+ .*)$", body, flags=re.MULTILINE)
+
+    # pieces = [preamble, heading1, text1, heading2, text2, ...]. Re-attach
+    # every heading to its own text so a chunk boundary can never fall
+    # between a heading and the content it introduces.
+    sections = [pieces[0].strip()]
+    for heading, text in zip(pieces[1::2], pieces[2::2]):
+        sections.append((heading + text).strip())
+
+    title = metadata.get("title", "")
+    prefix = title + "\n\n" if title else ""
+
+    chunks = []
+    pending = []  # sections collected for the chunk being built
+    pending_len = len(prefix)
+
+    for section in sections:
+        if not section:
+            continue
+        added = len(section) + (2 if pending else 0)  # 2 = "\n\n" separator
+        # Flush only when real content is pending; this avoids emitting a
+        # chunk that contains nothing but the title.
+        if pending and pending_len + added > max_chars:
+            chunks.append({
+                "metadata": dict(metadata),
+                "content": prefix + "\n\n".join(pending)
+            })
+            pending = []
+            pending_len = len(prefix)
+            added = len(section)
+        pending.append(section)
+        pending_len += added
+
+    if pending:
+        chunks.append({
+            "metadata": dict(metadata),
+            "content": prefix + "\n\n".join(pending)
+        })
+
+    return chunks
+
+def process_all_docs(output_file="fluig_chunks.json", docs_dir="fluig_rag_docs"):
+    """
+    Chunk every Markdown file under ``docs_dir`` and write the result as JSON.
+
+    Files named exactly ``index.md`` are skipped. All remaining ``*.md``
+    files, at any depth, are passed to ``chunk_markdown`` and the resulting
+    chunks are written to ``output_file`` as a single JSON array.
+
+    Args:
+        output_file: Path of the JSON file to write (overwritten if present).
+        docs_dir: Root directory that is searched recursively for ``*.md``.
+    """
+    pattern = os.path.join(docs_dir, "**", "*.md")
+    # Sorted so the output is reproducible; glob order depends on the
+    # underlying filesystem.
+    files = sorted(glob.glob(pattern, recursive=True))
+
+    print(f"Processando {len(files)} arquivos para RAG...")
+    all_chunks = []
+    for md_path in files:
+        # Compare the file name exactly: a substring test on the whole path
+        # would also skip e.g. "reindex.md" or files below a directory whose
+        # name contains "index.md".
+        if os.path.basename(md_path) == "index.md":
+            continue
+        all_chunks.extend(chunk_markdown(md_path))
+
+    with open(output_file, "w", encoding="utf-8") as out:
+        json.dump(all_chunks, out, indent=2, ensure_ascii=False)
+
+    print(f"Gerados {len(all_chunks)} chunks em {output_file}")
+
+if __name__ == "__main__":
+    # Executed as a script: run the whole pipeline with its default arguments.
+    process_all_docs()