feat: upgrade to async extractor, add RAG processing, link healing and Docker support

rodolpho
2026-05-07 18:43:43 -03:00
parent 68dc35abbd
commit 570292d8a9
116 changed files with 16277 additions and 388 deletions
@@ -1,11 +1,13 @@
import os
import requests
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
import json
from urllib.parse import urljoin, urlparse
import glob
# Configuration
BASE_URL = "https://tdn.totvs.com"
@@ -13,7 +15,8 @@ API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687" # Documentação Técnica
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1 # Seconds between requests to avoid being blocked
CONCURRENCY_LIMIT = 10
DELAY = 0.1
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@@ -23,38 +26,45 @@ HEADERS = {
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"
def get_page_children(page_id):
url = f"{API_URL}/{page_id}/child/page"
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
return response.json().get('results', [])
except Exception as e:
print(f"Erro ao buscar filhos da página {page_id}: {e}")
return []
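# Shared semaphore capping the number of in-flight HTTP requests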
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
def get_page_content(page_id):
url = f"{API_URL}/{page_id}?expand=body.export_view"
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
data = response.json()
title = data.get('title')
html = data.get('body', {}).get('export_view', {}).get('value', '')
links = data.get('_links', {})
web_ui = links.get('webui', "")
tiny_ui = links.get('tinyui', "")
return title, html, web_ui, tiny_ui
except Exception as e:
print(f"Erro ao buscar conteúdo da página {page_id}: {e}")
return None, None, None, None
async def fetch_json(session, url, params=None):
async with semaphore:
try:
async with session.get(url, params=params, headers=HEADERS, timeout=30) as response:
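# A 404 means the page does not exist; treat it as "no data" rather than an error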
if response.status == 404:
return None
response.raise_for_status()
return await response.json()
except Exception as e:
print(f"Erro ao buscar {url}: {e}")
return None
async def get_page_children(session, page_id):
url = f"{API_URL}/{page_id}/child/page"
data = await fetch_json(session, url)
return data.get('results', []) if data else []
async def get_page_content(session, page_id):
url = f"{API_URL}/{page_id}?expand=body.export_view,history.lastUpdated"
data = await fetch_json(session, url)
if not data:
return None, None, None, None, None
title = data.get('title')
html = data.get('body', {}).get('export_view', {}).get('value', '')
last_updated = data.get('history', {}).get('lastUpdated', {}).get('when', '')
links = data.get('_links', {})
web_ui = links.get('webui', "")
tiny_ui = links.get('tinyui', "")
return title, html, web_ui, tiny_ui, last_updated
def clean_filename(filename):
return re.sub(r'[\\/*?:"<>|]', "", filename)
def download_image(img_url):
async def download_image(session, img_url):
"""Baixa uma imagem e retorna o caminho local relativo."""
if not img_url.startswith("http"):
img_url = urljoin(BASE_URL, img_url)
@@ -69,29 +79,105 @@ def download_image(img_url):
if os.path.exists(local_path):
return os.path.join("images", img_name)
try:
img_data = requests.get(img_url, headers=HEADERS, timeout=20).content
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
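# Limit concurrent downloads with the shared semaphore; on failure, fall back to the remote URL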
async with semaphore:
try:
async with session.get(img_url, headers=HEADERS, timeout=20) as response:
if response.status != 200: return img_url
img_data = await response.read()
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
def process_links_and_images(html, current_file_dir, url_map):
def treat_macros(soup):
"""Trata macros específicas do Confluence para melhorar o Markdown."""
# Expand macro -> HTML details
for expand_div in soup.find_all("div", class_="expand-container"):
title_div = expand_div.find("div", class_="expand-control")
content_div = expand_div.find("div", class_="expand-content")
if title_div and content_div:
title_text = title_div.get_text(strip=True) or "Expandir"
new_tag = soup.new_tag("details")
summary = soup.new_tag("summary")
summary.string = title_text
new_tag.append(summary)
# Preserve the inner content
new_tag.append(content_div)
expand_div.replace_with(new_tag)
# Info/Warning macros
macro_mapping = {
"confluence-information-macro-information": "info",
"confluence-information-macro-note": "note",
"confluence-information-macro-warning": "warning",
"confluence-information-macro-tip": "tip"
}
for macro in soup.find_all("div", class_="confluence-information-macro"):
m_type = "info"
classes = macro.get("class", [])
for cls, target in macro_mapping.items():
if cls in classes:
m_type = target
break
# Inject a marker we can convert later, or leave it as a blockquote.
# markdownify will convert <blockquote> into ">".
# Aim for something MkDocs Admonition can recognize where possible,
# or at least keep it more readable.
title_span = macro.find("span", class_="confluence-information-macro-title")
title = f"**{title_span.get_text(strip=True)}**\n\n" if title_span else ""
blockquote = soup.new_tag("blockquote")
content_body = macro.find("div", class_="confluence-information-macro-body")
if content_body:
# Prefix with the type to simplify Admonition post-processing
content_body.insert(0, BeautifulSoup(f"<p>!!! {m_type}</p>", "html.parser"))
blockquote.append(content_body)
macro.replace_with(blockquote)
return soup
def sanitize_code_blocks(markdown_content):
"""Tenta inferir a linguagem de blocos de código sem linguagem definida."""
def replace_code(match):
lang = match.group(1).strip()
code = match.group(2)
if not lang or lang == "java": # Confluence sometimes mislabels java vs js
if any(x in code for x in ["DatasetBuilder", "createDataset", "displayFields", "getSelectedZoomItem"]):
lang = "javascript"
elif any(x in code for x in ["PreparedStatement", "ResultSet", "DriverManager"]):
lang = "java"
elif "SELECT" in code.upper() and "FROM" in code.upper():
lang = "sql"
return f"```{lang}\n{code}\n```"
pattern = re.compile(r"```(.*?)\n(.*?)\n```", re.DOTALL)
return pattern.sub(replace_code, markdown_content)
async def process_links_and_images(session, html, current_file_dir, url_map):
"""Processa HTML para baixar imagens e preparar links locais."""
soup = BeautifulSoup(html, "html.parser")
soup = treat_macros(soup)
# Process images
tasks = []
imgs_to_process = []
for img in soup.find_all("img"):
src = img.get("src")
if src:
local_img_path = download_image(src)
# Adjust to a path relative to the current .md file
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_img_path), current_file_dir)
tasks.append(download_image(session, src))
imgs_to_process.append(img)
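# Download all images for this page concurrently, then rewrite their src attributes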
if tasks:
local_paths = await asyncio.gather(*tasks)
for img, local_path in zip(imgs_to_process, local_paths):
if local_path.startswith("http"): continue
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_path), current_file_dir)
img["src"] = rel_path
# Links are handled later, in a separate "post-process" pass
return str(soup)
def save_markdown(path, title, html_content, source_url):
@@ -100,146 +186,140 @@ def save_markdown(path, title, html_content, source_url):
# Convert HTML to Markdown
markdown_content = md(html_content, heading_style="ATX", bullets="-")
# Convert the Admonition markers we injected earlier
markdown_content = markdown_content.replace("> !!!", "!!!")
# Sanitize code blocks
markdown_content = sanitize_code_blocks(markdown_content)
header = f"---\ntitle: {title}\nsource: {source_url}\npath: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
with open(path, "w", encoding="utf-8") as f:
f.write(header + markdown_content)
def build_tree_map(page_id, current_path, url_map):
async def build_tree_map(session, page_id, current_path, url_map):
"""Primeiro passo: mapear todas as URLs para caminhos locais."""
title, _, web_ui, tiny_ui = get_page_content(page_id)
title, _, web_ui, tiny_ui, _ = await get_page_content(session, page_id)
if not title: return
safe_title = clean_filename(title)
file_path = os.path.join(current_path, f"{safe_title}.md")
# Map the various URL formats
urls_to_map = []
urls_to_map = [web_ui, urljoin(BASE_URL, web_ui), tiny_ui, urljoin(BASE_URL, tiny_ui)]
if web_ui:
urls_to_map.append(web_ui)
urls_to_map.append(urljoin(BASE_URL, web_ui))
# Variant using display/public/fluig
public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
urls_to_map.append(public_ui)
urls_to_map.append(urljoin(BASE_URL, public_ui))
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
urls_to_map.extend([public_ui, urljoin(BASE_URL, public_ui)])
# Map by page ID directly
id_ui = f"/pages/viewpage.action?pageId={page_id}"
urls_to_map.append(id_ui)
urls_to_map.append(urljoin(BASE_URL, id_ui))
urls_to_map.extend([id_ui, urljoin(BASE_URL, id_ui)])
# Short links format /x/
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
for u in urls_to_map:
for u in filter(None, urls_to_map):
url_map[u] = file_path
print(f"Mapeando: {title}")
children = get_page_children(page_id)
children = await get_page_children(session, page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
build_tree_map(child['id'], new_path, url_map)
time.sleep(0.1)
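# Recurse into child pages concurrently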
tasks = [build_tree_map(session, child['id'], new_path, url_map) for child in children]
await asyncio.gather(*tasks)
def extract_content(page_id, current_path, progress_data, url_map):
async def extract_content(session, page_id, current_path, progress_data, url_map):
"""Segundo passo: baixar conteúdo e imagens."""
if page_id in progress_data: return
title, html, web_ui, tiny_ui = get_page_content(page_id)
title, html, web_ui, tiny_ui, last_updated = await get_page_content(session, page_id)
if not title: return
safe_title = clean_filename(title)
file_dir = current_path
file_path = os.path.join(file_dir, f"{safe_title}.md")
print(f"Extraindo: {title}")
# Process images locally
processed_html = process_links_and_images(html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = True
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
time.sleep(DELAY)
children = get_page_children(page_id)
# Incremental extraction: skip pages unchanged since the last run
if page_id in progress_data and progress_data[page_id] == last_updated and os.path.exists(file_path):
print(f"Pulando (Inalterado): {title}")
else:
print(f"Extraindo: {title}")
processed_html = await process_links_and_images(session, html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = last_updated
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
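# Descend into child pages, processing siblings concurrently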
children = await get_page_children(session, page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
extract_content(child['id'], new_path, progress_data, url_map)
tasks = [extract_content(session, child['id'], new_path, progress_data, url_map) for child in children]
await asyncio.gather(*tasks)
def post_process_links(url_map):
"""Terceiro passo: trocar links da web por links locais relativos."""
print("\nIniciando pós-processamento de links locais...")
# Sort keys by length (longest first) to avoid incorrect partial replacements
sorted_urls = sorted(url_map.keys(), key=len, reverse=True)
# Collect every Markdown file in the output directory
import glob
all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)
# Regex for Markdown links that supports nested parentheses
link_pattern = re.compile(r"\[(.*?)\]\(((?:[^()]+|\([^()]*\))*)\)")
for local_file_path in all_files:
if not os.path.isfile(local_file_path): continue
with open(local_file_path, "r", encoding="utf-8") as f:
content = f.read()
original_content = content
current_dir = os.path.dirname(local_file_path)
for target_url in sorted_urls:
if target_url in content:
target_local_path = url_map[target_url]
# Do not replace a link pointing to the file itself
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir)
rel_link = rel_link.replace("\\", "/")
# Replace both the [Text](URL) form and bare URLs
content = content.replace(f"({target_url})", f"({rel_link})")
content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
content = content.replace(f" {target_url} ", f" {rel_link} ")
def replace_url_in_link(match):
text = match.group(1)
url = match.group(2)
for target_url in sorted_urls:
if target_url in url:
target_local_path = url_map[target_url]
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir).replace("\\", "/")
# If the link points to a sub-page but the parent was mapped to a .md file,
# fix it to preserve the directory structure
if url.startswith(target_url + "/"):
parent_dir = rel_link.replace(".md", "")
new_url = url.replace(target_url, parent_dir)
return f"[{text}]({new_url})"
return f"[{text}]({rel_link})"
return match.group(0)
content = link_pattern.sub(replace_url_in_link, content)
if content != original_content:
with open(local_file_path, "w", encoding="utf-8") as f:
f.write(content)
def main():
async def main():
if not os.path.exists(IMAGES_DIR):
os.makedirs(IMAGES_DIR)
url_map = {}
if os.path.exists(URL_MAP_FILE):
with open(URL_MAP_FILE, "r") as f:
url_map = json.load(f)
else:
print("Fase 1: Mapeando árvore de URLs...")
build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
with open(URL_MAP_FILE, "w") as f:
json.dump(url_map, f)
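# One shared ClientSession reuses TCP connections across all requests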
async with aiohttp.ClientSession() as session:
url_map = {}
if os.path.exists(URL_MAP_FILE):
with open(URL_MAP_FILE, "r") as f:
url_map = json.load(f)
else:
print("Fase 1: Mapeando árvore de URLs...")
await build_tree_map(session, ROOT_PAGE_ID, OUTPUT_DIR, url_map)
with open(URL_MAP_FILE, "w") as f:
json.dump(url_map, f)
progress_data = {}
if os.path.exists(PROGRESS_FILE):
with open(PROGRESS_FILE, "r") as f:
progress_data = json.load(f)
progress_data = {}
if os.path.exists(PROGRESS_FILE):
with open(PROGRESS_FILE, "r") as f:
progress_data = json.load(f)
print("\nFase 2: Extraindo conteúdo e imagens...")
extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
post_process_links(url_map)
print("\nWiki local 100% Offline concluída!")
print("\nFase 2: Extraindo conteúdo e imagens...")
await extract_content(session, ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
post_process_links(url_map)
print("\nWiki local 100% Offline concluída!")
if __name__ == "__main__":
main()
asyncio.run(main())