feat: upgrade to async extractor, add RAG processing, link healing and Docker support

This commit is contained in:
rodolpho
2026-05-07 18:43:43 -03:00
parent 68dc35abbd
commit 570292d8a9
116 changed files with 16277 additions and 388 deletions
+90
View File
@@ -0,0 +1,90 @@
import os
import glob
import re
# Markdown inline link "[text](target)" whose target may contain one level of
# balanced parentheses. Each loop iteration consumes either a single non-paren
# character or one complete "(...)" group, so there is only one way to match a
# given run of text and unbalanced input cannot trigger exponential
# backtracking.
_LINK_PATTERN = re.compile(r"\[(.*?)\]\(((?:[^()]|\([^()]*\))*)\)")


def _is_external(link):
    """Return True when *link* must not be resolved on the local filesystem."""
    # Anything carrying a scheme-like ":" (https:, mailto:, ...) is external,
    # unless it is an explicitly relative path starting with ".".
    return link.startswith(("http://", "https://")) or (
        ":" in link and not link.startswith(".")
    )


def _relative_link(abs_path, current_dir):
    """Return *abs_path* relative to *current_dir* using forward slashes."""
    return os.path.relpath(abs_path, current_dir).replace("\\", "/")


def _heal_target(link, current_dir, indexed_names):
    """Return a repaired target for *link*, or None to leave the link alone.

    Args:
        link: Raw target taken from inside the parentheses of a Markdown link.
        current_dir: Directory of the Markdown file containing the link;
            relative targets are resolved against it.
        indexed_names: List of ``(absolute_path, lowercased_basename)`` pairs
            for every regular file in the documentation tree.

    Returns:
        The new target (path plus the original anchor), or None when the link
        is external, anchor-only, already valid, or cannot be repaired.
    """
    if _is_external(link):
        return None

    # partition() keeps everything after the first "#", so anchors that
    # themselves contain "#" survive intact.
    path_part, hash_sep, fragment = link.partition("#")
    anchor = hash_sep + fragment
    if not path_part:
        return None

    def resolves(rel_path):
        return os.path.exists(os.path.abspath(os.path.join(current_dir, rel_path)))

    # A link that already resolves is never rewritten, even if it passes
    # through a real directory whose name ends in ".md".
    if resolves(path_part):
        return None

    # Fix 1: drop a stray ".md" in the middle of the path left by a partial
    # substitution, e.g. "Path.md/Subpath.md" -> "Path/Subpath.md".
    if ".md/" in path_part:
        path_part = path_part.replace(".md/", "/")
        if resolves(path_part):
            return path_part + anchor

    # Fix 2: try adding the extension and/or a closing parenthesis that was
    # lost from the file name.
    candidates = (
        path_part + ".md",
        path_part + ").md",
        path_part + ")",
        path_part.rstrip("(") + ").md",
    )
    for candidate in candidates:
        candidate_abs = os.path.abspath(os.path.join(current_dir, candidate))
        if os.path.exists(candidate_abs):
            return _relative_link(candidate_abs, current_dir) + anchor

    # Fix 3: approximate search by file name. Among all files whose name
    # contains the wanted name, pick the shortest one (the closest match);
    # ties are broken by path so the result does not depend on listing order.
    wanted = os.path.basename(path_part).lower().strip()
    if wanted:
        hits = [
            (len(name), abs_path)
            for abs_path, name in indexed_names
            if wanted in name
        ]
        if hits:
            _, best_abs = min(hits)
            return _relative_link(best_abs, current_dir) + anchor

    return None


def heal_links(docs_dir="fluig_rag_docs"):
    """Repair broken relative links in every Markdown file under *docs_dir*.

    Each ``[text](target)`` link whose target does not exist on disk is
    repaired, in order of preference, by removing a stray ``.md/`` path
    segment, by appending ``.md`` and/or a missing ``)``, or by an approximate
    file-name search over the whole tree. External links, anchor-only links
    and links that already resolve are left untouched. Files are rewritten
    in place only when at least one link changed.

    Args:
        docs_dir: Root directory of the documentation tree.

    Returns:
        The number of links that were rewritten.
    """
    # Sorted so processing order is stable across filesystems.
    entries = sorted(glob.glob(os.path.join(docs_dir, "**", "*"), recursive=True))
    # Only regular files: a directory named "Something.md" must neither be
    # opened as a Markdown file nor offered as a fuzzy-match target.
    files = [entry for entry in entries if os.path.isfile(entry)]
    indexed_names = [
        (os.path.abspath(path), os.path.basename(path).lower()) for path in files
    ]
    md_files = [path for path in files if path.endswith(".md")]

    healed_count = 0
    print(f"Iniciando cura profunda de links em {len(md_files)} arquivos...")

    for file_path in md_files:
        # newline="" disables newline translation so the file keeps its
        # original line endings when it is written back.
        with open(file_path, "r", encoding="utf-8", newline="") as f:
            content = f.read()

        current_dir = os.path.dirname(file_path) or "."

        def replace_link(match, current_dir=current_dir):
            nonlocal healed_count
            healed = _heal_target(match.group(2), current_dir, indexed_names)
            if healed is None:
                return match.group(0)
            healed_count += 1
            return f"[{match.group(1)}]({healed})"

        new_content = _LINK_PATTERN.sub(replace_link, content)

        if new_content != content:
            with open(file_path, "w", encoding="utf-8", newline="") as f:
                f.write(new_content)

    print(f"\nCura concluída! {healed_count} links foram analisados/corrigidos.")
    return healed_count
# Script entry point: run the link healer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    heal_links()