import os
import glob
import re
import json

def chunk_markdown(file_path, max_chars=2000):
    """
    Split a Markdown file into chunks based on headings.

    Tries to keep the parent heading's context by prepending the document
    title to every chunk.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract metadata (frontmatter)
    meta_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
    metadata = {}
    if meta_match:
        meta_text = meta_match.group(1)
        for line in meta_text.split("\n"):
            if ":" in line:
                key, val = line.split(":", 1)
                metadata[key.strip()] = val.strip()
        body = content[meta_match.end():]
    else:
        body = content

    # Split on ## or ### headings (the heading lines are kept as their own sections)
    sections = re.split(r"\n(##+ .*)\n", body)

    chunks = []
    current_chunk = metadata.get("title", "") + "\n\n"

    for section in sections:
        # Flush the current chunk once adding this section would exceed max_chars
        if len(current_chunk) + len(section) > max_chars and current_chunk.strip():
            chunks.append({
                "metadata": metadata,
                "content": current_chunk.strip()
            })
            current_chunk = metadata.get("title", "") + "\n\n"  # restart with the title for context

        current_chunk += section + "\n"

    if current_chunk.strip():
        chunks.append({
            "metadata": metadata,
            "content": current_chunk.strip()
        })

    return chunks

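# Illustrative note (assumption, not part of the original script): for a file whose
# frontmatter contains "title: Example Page", each element returned by
# chunk_markdown() has roughly this shape:
#
#   {
#       "metadata": {"title": "Example Page"},
#       "content": "Example Page\n\n## First Section\n..."
#   }
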

def process_all_docs(output_file="fluig_chunks.json"):
    all_chunks = []
    files = glob.glob("fluig_rag_docs/**/*.md", recursive=True)

    print(f"Processing {len(files)} files for RAG...")
    for path in files:
        # Skip index pages
        if "index.md" in path:
            continue
        chunks = chunk_markdown(path)
        all_chunks.extend(chunks)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)

    print(f"Generated {len(all_chunks)} chunks in {output_file}")


if __name__ == "__main__":
    process_all_docs()