import os
import glob
import re
import json


def chunk_markdown(file_path, max_chars=2000):
    """Split a Markdown file into chunks based on headings.

    An optional frontmatter block (delimited by ``---`` lines) is parsed
    into simple ``key: value`` metadata.  The body is then split on
    ``##``/``###`` heading lines, and each chunk is re-seeded with the
    document title so it retains context on its own.

    Args:
        file_path: Path to the Markdown file to read (UTF-8).
        max_chars: Soft upper bound on chunk size.  NOTE: a single
            section longer than this is NOT split further.

    Returns:
        A list of dicts, each with ``"metadata"`` (the frontmatter
        key/value pairs, shared by all chunks) and ``"content"`` (the
        stripped chunk text).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract frontmatter metadata, if present, and drop it from the body.
    meta_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
    metadata = {}
    if meta_match:
        for line in meta_match.group(1).split("\n"):
            if ":" in line:
                # Split on the FIRST colon only, so values may contain colons.
                key, val = line.split(":", 1)
                metadata[key.strip()] = val.strip()
        body = content[meta_match.end():]
    else:
        body = content

    # Split on ## / ### heading lines.  The capturing group keeps each
    # heading as its own list element, so it flows into the chunk right
    # before its section content.
    sections = re.split(r"\n(##+ .*)\n", body)

    chunks = []
    current_chunk = metadata.get("title", "") + "\n\n"
    for section in sections:
        if len(current_chunk) + len(section) > max_chars and current_chunk.strip():
            chunks.append({
                "metadata": metadata,
                "content": current_chunk.strip()
            })
            # Restart with the title so the next chunk keeps parent context.
            current_chunk = metadata.get("title", "") + "\n\n"
        current_chunk += section + "\n"

    if current_chunk.strip():
        chunks.append({
            "metadata": metadata,
            "content": current_chunk.strip()
        })
    return chunks


def process_all_docs(output_file="fluig_chunks.json", docs_dir="fluig_rag_docs"):
    """Chunk every Markdown doc under *docs_dir* and dump the result as JSON.

    Args:
        output_file: Path of the JSON file to write (UTF-8, non-ASCII kept).
        docs_dir: Root directory searched recursively for ``*.md`` files.
            Defaults to the original hard-coded ``fluig_rag_docs``.
    """
    files = glob.glob(os.path.join(docs_dir, "**", "*.md"), recursive=True)
    print(f"Processando {len(files)} arquivos para RAG...")

    all_chunks = []
    for path in files:
        # Skip index pages.  Compare the basename exactly — the previous
        # substring test ('"index.md" in path') also wrongly excluded files
        # such as "reindex.md" or any path containing that substring.
        if os.path.basename(path) == "index.md":
            continue
        all_chunks.extend(chunk_markdown(path))

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(all_chunks, out, indent=2, ensure_ascii=False)
    print(f"Gerados {len(all_chunks)} chunks em {output_file}")


if __name__ == "__main__":
    process_all_docs()