feat: upgrade to async extractor, add RAG processing, link healing and Docker support

rodolpho
2026-05-07 18:43:43 -03:00
parent 68dc35abbd
commit 570292d8a9
116 changed files with 16277 additions and 388 deletions
@@ -1,11 +1,13 @@
import os
import requests
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
import json
from urllib.parse import urljoin, urlparse
import glob
# Configuration
BASE_URL = "https://tdn.totvs.com"
@@ -13,7 +15,8 @@ API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687" # Documentação Técnica
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1 # Seconds between requests to avoid being blocked
CONCURRENCY_LIMIT = 10
DELAY = 0.1
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@@ -23,38 +26,45 @@ HEADERS = {
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"
def get_page_children(page_id):
url = f"{API_URL}/{page_id}/child/page"
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
return response.json().get('results', [])
except Exception as e:
print(f"Erro ao buscar filhos da página {page_id}: {e}")
return []
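# Shared semaphore capping the number of in-flight HTTP requests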
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
def get_page_content(page_id):
url = f"{API_URL}/{page_id}?expand=body.export_view"
try:
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
data = response.json()
title = data.get('title')
html = data.get('body', {}).get('export_view', {}).get('value', '')
links = data.get('_links', {})
web_ui = links.get('webui', "")
tiny_ui = links.get('tinyui', "")
return title, html, web_ui, tiny_ui
except Exception as e:
print(f"Erro ao buscar conteúdo da página {page_id}: {e}")
return None, None, None, None
async def fetch_json(session, url, params=None):
async with semaphore:
try:
async with session.get(url, params=params, headers=HEADERS, timeout=30) as response:
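# A 404 means the page does not exist; treat it as "no data" rather than an error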
if response.status == 404:
return None
response.raise_for_status()
return await response.json()
except Exception as e:
print(f"Erro ao buscar {url}: {e}")
return None
async def get_page_children(session, page_id):
url = f"{API_URL}/{page_id}/child/page"
data = await fetch_json(session, url)
return data.get('results', []) if data else []
async def get_page_content(session, page_id):
url = f"{API_URL}/{page_id}?expand=body.export_view,history.lastUpdated"
data = await fetch_json(session, url)
if not data:
return None, None, None, None, None
title = data.get('title')
html = data.get('body', {}).get('export_view', {}).get('value', '')
last_updated = data.get('history', {}).get('lastUpdated', {}).get('when', '')
links = data.get('_links', {})
web_ui = links.get('webui', "")
tiny_ui = links.get('tinyui', "")
return title, html, web_ui, tiny_ui, last_updated
def clean_filename(filename):
return re.sub(r'[\\/*?:"<>|]', "", filename)
def download_image(img_url):
async def download_image(session, img_url):
"""Baixa uma imagem e retorna o caminho local relativo."""
if not img_url.startswith("http"):
img_url = urljoin(BASE_URL, img_url)
@@ -69,29 +79,105 @@ def download_image(img_url):
if os.path.exists(local_path):
return os.path.join("images", img_name)
try:
img_data = requests.get(img_url, headers=HEADERS, timeout=20).content
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
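# Limit concurrent downloads with the shared semaphore; on failure, fall back to the remote URL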
async with semaphore:
try:
async with session.get(img_url, headers=HEADERS, timeout=20) as response:
if response.status != 200: return img_url
img_data = await response.read()
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
def process_links_and_images(html, current_file_dir, url_map):
def treat_macros(soup):
"""Trata macros específicas do Confluence para melhorar o Markdown."""
# Expand macro -> HTML details
for expand_div in soup.find_all("div", class_="expand-container"):
title_div = expand_div.find("div", class_="expand-control")
content_div = expand_div.find("div", class_="expand-content")
if title_div and content_div:
title_text = title_div.get_text(strip=True) or "Expandir"
new_tag = soup.new_tag("details")
summary = soup.new_tag("summary")
summary.string = title_text
new_tag.append(summary)
# Preserve the inner content
new_tag.append(content_div)
expand_div.replace_with(new_tag)
# Info/Warning macros
macro_mapping = {
"confluence-information-macro-information": "info",
"confluence-information-macro-note": "note",
"confluence-information-macro-warning": "warning",
"confluence-information-macro-tip": "tip"
}
for macro in soup.find_all("div", class_="confluence-information-macro"):
m_type = "info"
classes = macro.get("class", [])
for cls, target in macro_mapping.items():
if cls in classes:
m_type = target
break
# Inject a marker we can convert later, or leave it as a blockquote.
# markdownify will convert <blockquote> into ">".
# Aim for something MkDocs Admonition can recognize where possible,
# or at least keep it more readable.
title_span = macro.find("span", class_="confluence-information-macro-title")
title = f"**{title_span.get_text(strip=True)}**\n\n" if title_span else ""
blockquote = soup.new_tag("blockquote")
content_body = macro.find("div", class_="confluence-information-macro-body")
if content_body:
# Prefix with the type to simplify Admonition post-processing
content_body.insert(0, BeautifulSoup(f"<p>!!! {m_type}</p>", "html.parser"))
blockquote.append(content_body)
macro.replace_with(blockquote)
return soup
def sanitize_code_blocks(markdown_content):
"""Tenta inferir a linguagem de blocos de código sem linguagem definida."""
def replace_code(match):
lang = match.group(1).strip()
code = match.group(2)
if not lang or lang == "java": # Confluence sometimes mislabels java vs js
if any(x in code for x in ["DatasetBuilder", "createDataset", "displayFields", "getSelectedZoomItem"]):
lang = "javascript"
elif any(x in code for x in ["PreparedStatement", "ResultSet", "DriverManager"]):
lang = "java"
elif "SELECT" in code.upper() and "FROM" in code.upper():
lang = "sql"
return f"```{lang}\n{code}\n```"
pattern = re.compile(r"```(.*?)\n(.*?)\n```", re.DOTALL)
return pattern.sub(replace_code, markdown_content)
async def process_links_and_images(session, html, current_file_dir, url_map):
"""Processa HTML para baixar imagens e preparar links locais."""
soup = BeautifulSoup(html, "html.parser")
soup = treat_macros(soup)
# Process images
tasks = []
imgs_to_process = []
for img in soup.find_all("img"):
src = img.get("src")
if src:
local_img_path = download_image(src)
# Adjust to a path relative to the current .md file
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_img_path), current_file_dir)
tasks.append(download_image(session, src))
imgs_to_process.append(img)
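# Download all images for this page concurrently, then rewrite their src attributes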
if tasks:
local_paths = await asyncio.gather(*tasks)
for img, local_path in zip(imgs_to_process, local_paths):
if local_path.startswith("http"): continue
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_path), current_file_dir)
img["src"] = rel_path
# Links are handled later, in a separate "post-process" pass
return str(soup)
def save_markdown(path, title, html_content, source_url):
@@ -100,146 +186,140 @@ def save_markdown(path, title, html_content, source_url):
# Convert HTML to Markdown
markdown_content = md(html_content, heading_style="ATX", bullets="-")
# Convert the Admonition markers we injected earlier
markdown_content = markdown_content.replace("> !!!", "!!!")
# Sanitize code blocks
markdown_content = sanitize_code_blocks(markdown_content)
header = f"---\ntitle: {title}\nsource: {source_url}\npath: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
with open(path, "w", encoding="utf-8") as f:
f.write(header + markdown_content)
def build_tree_map(page_id, current_path, url_map):
async def build_tree_map(session, page_id, current_path, url_map):
"""Primeiro passo: mapear todas as URLs para caminhos locais."""
title, _, web_ui, tiny_ui = get_page_content(page_id)
title, _, web_ui, tiny_ui, _ = await get_page_content(session, page_id)
if not title: return
safe_title = clean_filename(title)
file_path = os.path.join(current_path, f"{safe_title}.md")
# Map the various URL formats
urls_to_map = []
urls_to_map = [web_ui, urljoin(BASE_URL, web_ui), tiny_ui, urljoin(BASE_URL, tiny_ui)]
if web_ui:
urls_to_map.append(web_ui)
urls_to_map.append(urljoin(BASE_URL, web_ui))
# Variant using display/public/fluig
public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
urls_to_map.append(public_ui)
urls_to_map.append(urljoin(BASE_URL, public_ui))
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
urls_to_map.extend([public_ui, urljoin(BASE_URL, public_ui)])
# Map by page ID directly
id_ui = f"/pages/viewpage.action?pageId={page_id}"
urls_to_map.append(id_ui)
urls_to_map.append(urljoin(BASE_URL, id_ui))
urls_to_map.extend([id_ui, urljoin(BASE_URL, id_ui)])
# Short links format /x/
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
for u in urls_to_map:
for u in filter(None, urls_to_map):
url_map[u] = file_path
print(f"Mapeando: {title}")
children = get_page_children(page_id)
children = await get_page_children(session, page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
build_tree_map(child['id'], new_path, url_map)
time.sleep(0.1)
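# Recurse into child pages concurrently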
tasks = [build_tree_map(session, child['id'], new_path, url_map) for child in children]
await asyncio.gather(*tasks)
def extract_content(page_id, current_path, progress_data, url_map):
async def extract_content(session, page_id, current_path, progress_data, url_map):
"""Segundo passo: baixar conteúdo e imagens."""
if page_id in progress_data: return
title, html, web_ui, tiny_ui = get_page_content(page_id)
title, html, web_ui, tiny_ui, last_updated = await get_page_content(session, page_id)
if not title: return
safe_title = clean_filename(title)
file_dir = current_path
file_path = os.path.join(file_dir, f"{safe_title}.md")
print(f"Extraindo: {title}")
# Process images locally
processed_html = process_links_and_images(html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = True
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
time.sleep(DELAY)
children = get_page_children(page_id)
# Incremental extraction: skip pages unchanged since the last run
if page_id in progress_data and progress_data[page_id] == last_updated and os.path.exists(file_path):
print(f"Pulando (Inalterado): {title}")
else:
print(f"Extraindo: {title}")
processed_html = await process_links_and_images(session, html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = last_updated
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
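# Descend into child pages, processing siblings concurrently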
children = await get_page_children(session, page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
extract_content(child['id'], new_path, progress_data, url_map)
tasks = [extract_content(session, child['id'], new_path, progress_data, url_map) for child in children]
await asyncio.gather(*tasks)
def post_process_links(url_map):
"""Terceiro passo: trocar links da web por links locais relativos."""
print("\nIniciando pós-processamento de links locais...")
# Sort keys by length (longest first) to avoid incorrect partial replacements
sorted_urls = sorted(url_map.keys(), key=len, reverse=True)
# Collect every Markdown file in the output directory
import glob
all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)
# Regex for Markdown links that supports nested parentheses
link_pattern = re.compile(r"\[(.*?)\]\(((?:[^()]+|\([^()]*\))*)\)")
for local_file_path in all_files:
if not os.path.isfile(local_file_path): continue
with open(local_file_path, "r", encoding="utf-8") as f:
content = f.read()
original_content = content
current_dir = os.path.dirname(local_file_path)
for target_url in sorted_urls:
if target_url in content:
target_local_path = url_map[target_url]
# Do not replace a link pointing to the file itself
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir)
rel_link = rel_link.replace("\\", "/")
# Replace both the [Text](URL) form and bare URLs
content = content.replace(f"({target_url})", f"({rel_link})")
content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
content = content.replace(f" {target_url} ", f" {rel_link} ")
def replace_url_in_link(match):
text = match.group(1)
url = match.group(2)
for target_url in sorted_urls:
if target_url in url:
target_local_path = url_map[target_url]
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir).replace("\\", "/")
# If the link points to a sub-page but the parent was mapped to a .md file,
# fix it to preserve the directory structure
if url.startswith(target_url + "/"):
parent_dir = rel_link.replace(".md", "")
new_url = url.replace(target_url, parent_dir)
return f"[{text}]({new_url})"
return f"[{text}]({rel_link})"
return match.group(0)
content = link_pattern.sub(replace_url_in_link, content)
if content != original_content:
with open(local_file_path, "w", encoding="utf-8") as f:
f.write(content)
def main():
async def main():
if not os.path.exists(IMAGES_DIR):
os.makedirs(IMAGES_DIR)
url_map = {}
if os.path.exists(URL_MAP_FILE):
with open(URL_MAP_FILE, "r") as f:
url_map = json.load(f)
else:
print("Fase 1: Mapeando árvore de URLs...")
build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
with open(URL_MAP_FILE, "w") as f:
json.dump(url_map, f)
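# One shared ClientSession reuses TCP connections across all requests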
async with aiohttp.ClientSession() as session:
url_map = {}
if os.path.exists(URL_MAP_FILE):
with open(URL_MAP_FILE, "r") as f:
url_map = json.load(f)
else:
print("Fase 1: Mapeando árvore de URLs...")
await build_tree_map(session, ROOT_PAGE_ID, OUTPUT_DIR, url_map)
with open(URL_MAP_FILE, "w") as f:
json.dump(url_map, f)
progress_data = {}
if os.path.exists(PROGRESS_FILE):
with open(PROGRESS_FILE, "r") as f:
progress_data = json.load(f)
progress_data = {}
if os.path.exists(PROGRESS_FILE):
with open(PROGRESS_FILE, "r") as f:
progress_data = json.load(f)
print("\nFase 2: Extraindo conteúdo e imagens...")
extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
post_process_links(url_map)
print("\nWiki local 100% Offline concluída!")
print("\nFase 2: Extraindo conteúdo e imagens...")
await extract_content(session, ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
post_process_links(url_map)
print("\nWiki local 100% Offline concluída!")
if __name__ == "__main__":
main()
asyncio.run(main())