# apitdn/fluig_extractor.py
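"""Extract the Fluig documentation tree from TOTVS TDN (Confluence) into a
local Markdown wiki. The script runs in three phases:

1. build_tree_map: walk the page tree and map every URL variant of each
   page to the local .md path it will be saved under.
2. extract_content: download each page's HTML and images, convert the HTML
   to Markdown, and record progress so interrupted runs can resume.
3. post_process_links: rewrite links between downloaded pages to point at
   local relative paths instead of the web URLs.
"""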
import glob
import hashlib
import json
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Configuration
BASE_URL = "https://tdn.totvs.com"
API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687"  # "Documentação Técnica" (Technical Documentation) root page
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1  # seconds between requests, to avoid being blocked
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Control files (allow interrupted runs to resume)
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"


def get_page_children(page_id):
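    """Return the direct child pages of `page_id` via the Confluence REST API."""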
    url = f"{API_URL}/{page_id}/child/page"
    # NOTE: Confluence paginates this endpoint; pages with very many children
    # may need a larger ?limit= or to follow _links.next (not handled here).
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.json().get('results', [])
    except Exception as e:
        print(f"Error fetching children of page {page_id}: {e}")
        return []

def get_page_content(page_id):
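    """Fetch a page's title, rendered HTML, and its webui/tinyui links."""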
    url = f"{API_URL}/{page_id}?expand=body.export_view"
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        data = response.json()
        title = data.get('title')
        html = data.get('body', {}).get('export_view', {}).get('value', '')
        links = data.get('_links', {})
        web_ui = links.get('webui', "")
        tiny_ui = links.get('tinyui', "")
        return title, html, web_ui, tiny_ui
    except Exception as e:
        print(f"Error fetching content of page {page_id}: {e}")
        return None, None, None, None

def clean_filename(filename):
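    """Strip characters that are invalid in Windows/Unix file names."""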
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def download_image(img_url):
"""Baixa uma imagem e retorna o caminho local relativo."""
if not img_url.startswith("http"):
img_url = urljoin(BASE_URL, img_url)
parsed_url = urlparse(img_url)
img_name = clean_filename(os.path.basename(parsed_url.path))
if not img_name:
img_name = f"img_{hash(img_url)}.png"
local_path = os.path.join(IMAGES_DIR, img_name)
if os.path.exists(local_path):
return os.path.join("images", img_name)
try:
img_data = requests.get(img_url, headers=HEADERS, timeout=20).content
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
def process_links_and_images(html, current_file_dir, url_map):
"""Processa HTML para baixar imagens e preparar links locais."""
soup = BeautifulSoup(html, "html.parser")
# Processar Imagens
for img in soup.find_all("img"):
src = img.get("src")
if src:
local_img_path = download_image(src)
# Ajustar para caminho relativo ao arquivo .md atual
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_img_path), current_file_dir)
img["src"] = rel_path
# Processar Links (Mapeamento será feito em um segundo passo de "post-process")
return str(soup)
def save_markdown(path, title, html_content, source_url):
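    """Convert HTML to Markdown and write it with a YAML front-matter header."""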
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Convert HTML to Markdown
    markdown_content = md(html_content, heading_style="ATX", bullets="-")
    header = f"---\ntitle: {title}\nsource: {source_url}\npath: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
    with open(path, "w", encoding="utf-8") as f:
        f.write(header + markdown_content)

def build_tree_map(page_id, current_path, url_map):
"""Primeiro passo: mapear todas as URLs para caminhos locais."""
title, _, web_ui, tiny_ui = get_page_content(page_id)
if not title: return
safe_title = clean_filename(title)
file_path = os.path.join(current_path, f"{safe_title}.md")
# Mapear variados formatos de URL
urls_to_map = []
if web_ui:
urls_to_map.append(web_ui)
urls_to_map.append(urljoin(BASE_URL, web_ui))
# Variação com display/public/fluig
public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
urls_to_map.append(public_ui)
urls_to_map.append(urljoin(BASE_URL, public_ui))
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
# Mapear por ID direto
id_ui = f"/pages/viewpage.action?pageId={page_id}"
urls_to_map.append(id_ui)
urls_to_map.append(urljoin(BASE_URL, id_ui))
# Short links format /x/
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
for u in urls_to_map:
url_map[u] = file_path
print(f"Mapeando: {title}")
children = get_page_children(page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
build_tree_map(child['id'], new_path, url_map)
time.sleep(0.1)
def extract_content(page_id, current_path, progress_data, url_map):
"""Segundo passo: baixar conteúdo e imagens."""
if page_id in progress_data: return
title, html, web_ui, tiny_ui = get_page_content(page_id)
if not title: return
safe_title = clean_filename(title)
file_dir = current_path
file_path = os.path.join(file_dir, f"{safe_title}.md")
print(f"Extraindo: {title}")
# Processar imagens localmente
processed_html = process_links_and_images(html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = True
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
time.sleep(DELAY)
children = get_page_children(page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
extract_content(child['id'], new_path, progress_data, url_map)
def post_process_links(url_map):
"""Terceiro passo: trocar links da web por links locais relativos."""
print("\nIniciando pós-processamento de links locais...")
# Ordenar chaves por tamanho (maiores primeiro) para evitar substituições parciais incorretas
sorted_urls = sorted(url_map.keys(), key=len, reverse=True)
# Pega todos os arquivos markdown na pasta de saída
import glob
all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)
for local_file_path in all_files:
if not os.path.isfile(local_file_path): continue
with open(local_file_path, "r", encoding="utf-8") as f:
content = f.read()
original_content = content
current_dir = os.path.dirname(local_file_path)
for target_url in sorted_urls:
if target_url in content:
target_local_path = url_map[target_url]
# Não substituir o link para si mesmo
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir)
rel_link = rel_link.replace("\\", "/")
# Substituir tanto no formato [Texto](URL) quanto URL pura
content = content.replace(f"({target_url})", f"({rel_link})")
content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
content = content.replace(f" {target_url} ", f" {rel_link} ")
if content != original_content:
with open(local_file_path, "w", encoding="utf-8") as f:
f.write(content)
def main():
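    """Run all three phases, resuming from saved state when present."""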
    os.makedirs(IMAGES_DIR, exist_ok=True)
    url_map = {}
    if os.path.exists(URL_MAP_FILE):
        with open(URL_MAP_FILE, "r") as f:
            url_map = json.load(f)
    else:
        print("Phase 1: mapping the URL tree...")
        build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
        with open(URL_MAP_FILE, "w") as f:
            json.dump(url_map, f)
    progress_data = {}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as f:
            progress_data = json.load(f)
    print("\nPhase 2: extracting content and images...")
    extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
    post_process_links(url_map)
    print("\nLocal wiki complete, 100% offline!")

if __name__ == "__main__":
    main()
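
# Usage sketch (assumes the default file and directory names defined above):
#   python fluig_extractor.py
# The run is resumable: url_to_path_map.json caches the phase-1 URL map and
# extraction_progress.json records which pages were already written; delete
# both files to force a full re-extraction.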