import os
import glob
import re
import json

def chunk_markdown(file_path, max_chars=2000):
    """
    Split a Markdown file into chunks based on headings.

    Tries to keep the parent heading's context by prepending the document
    title to every chunk.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract metadata (frontmatter)
    meta_match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL)
    metadata = {}
    if meta_match:
        meta_text = meta_match.group(1)
        for line in meta_text.split("\n"):
            if ":" in line:
                key, val = line.split(":", 1)
                metadata[key.strip()] = val.strip()
        body = content[meta_match.end():]
    else:
        body = content

    # Split on ## or ### headings (the heading lines are kept as their own sections)
    sections = re.split(r"\n(##+ .*)\n", body)

    chunks = []
    current_chunk = metadata.get("title", "") + "\n\n"

    for section in sections:
        # Flush the current chunk once adding this section would exceed max_chars
        if len(current_chunk) + len(section) > max_chars and current_chunk.strip():
            chunks.append({
                "metadata": metadata,
                "content": current_chunk.strip()
            })
            current_chunk = metadata.get("title", "") + "\n\n"  # restart with the title for context

        current_chunk += section + "\n"

    if current_chunk.strip():
        chunks.append({
            "metadata": metadata,
            "content": current_chunk.strip()
        })

    return chunks

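# Illustrative note (assumption, not part of the original script): for a file whose
# frontmatter contains "title: Example Page", each element returned by
# chunk_markdown() has roughly this shape:
#
#   {
#       "metadata": {"title": "Example Page"},
#       "content": "Example Page\n\n## First Section\n..."
#   }
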

def process_all_docs(output_file="fluig_chunks.json"):
    all_chunks = []
    files = glob.glob("fluig_rag_docs/**/*.md", recursive=True)

    print(f"Processing {len(files)} files for RAG...")
    for path in files:
        # Skip index pages
        if "index.md" in path:
            continue
        chunks = chunk_markdown(path)
        all_chunks.extend(chunks)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)

    print(f"Generated {len(all_chunks)} chunks in {output_file}")


if __name__ == "__main__":
    process_all_docs()