import os
import re
import json
import time
import glob
import hashlib
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urljoin, urlparse

# Configuration
BASE_URL = "https://tdn.totvs.com"
API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687"  # "Documentação Técnica" root page
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1  # Seconds between requests, to avoid being blocked
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Control files
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"


def get_page_children(page_id):
    """Return the direct child pages of a page via the Confluence REST API."""
    url = f"{API_URL}/{page_id}/child/page"
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.json().get('results', [])
    except Exception as e:
        print(f"Error fetching children of page {page_id}: {e}")
        return []


def get_page_content(page_id):
    """Return (title, html, webui_link, tinyui_link) for a page."""
    url = f"{API_URL}/{page_id}?expand=body.export_view"
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        data = response.json()
        title = data.get('title')
        html = data.get('body', {}).get('export_view', {}).get('value', '')
        links = data.get('_links', {})
        web_ui = links.get('webui', "")
        tiny_ui = links.get('tinyui', "")
        return title, html, web_ui, tiny_ui
    except Exception as e:
        print(f"Error fetching content of page {page_id}: {e}")
        return None, None, None, None


def clean_filename(filename):
    """Strip characters that are invalid in file names."""
    return re.sub(r'[\\/*?:"<>|]', "", filename)


def download_image(img_url):
    """Download an image and return its local path relative to OUTPUT_DIR."""
    if not img_url.startswith("http"):
        img_url = urljoin(BASE_URL, img_url)
    parsed_url = urlparse(img_url)
    img_name = clean_filename(os.path.basename(parsed_url.path))
    if not img_name:
        # hashlib gives a name that is stable across runs; the built-in hash()
        # is salted per process, which would defeat the cache check below
        img_name = f"img_{hashlib.md5(img_url.encode()).hexdigest()}.png"
    local_path = os.path.join(IMAGES_DIR, img_name)

    if os.path.exists(local_path):
        return os.path.join("images", img_name)

    try:
        img_data = requests.get(img_url, headers=HEADERS, timeout=20).content
        with open(local_path, "wb") as f:
            f.write(img_data)
        return os.path.join("images", img_name)
    except Exception as e:
        print(f"Error downloading image {img_url}: {e}")
        return img_url


def process_links_and_images(html, current_file_dir, url_map):
    """Process the HTML, downloading images and localizing their src attributes."""
    soup = BeautifulSoup(html, "html.parser")

    # Images: download each one and point src at the local copy
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            local_img_path = download_image(src)
            # Rewrite as a path relative to the current .md file
            rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_img_path), current_file_dir)
            img["src"] = rel_path

    # Links are handled later, in the post-processing pass
    return str(soup)


def save_markdown(path, title, html_content, source_url):
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Convert HTML to Markdown
    markdown_content = md(html_content, heading_style="ATX", bullets="-")

    header = (
        f"---\ntitle: {title}\nsource: {source_url}\n"
        f"path: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
    )

    with open(path, "w", encoding="utf-8") as f:
        f.write(header + markdown_content)
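
# For reference, every extracted file begins with a small YAML front-matter
# block in the shape below (illustrative values only, not taken from a real
# page), which downstream RAG tooling can use for source attribution:
#
#   ---
#   title: Some Page Title
#   source: https://tdn.totvs.com/display/fluig/Some+Page+Title
#   path: /Some Parent/Some Page Title.md
#   ---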
def build_tree_map(page_id, current_path, url_map):
    """First pass: map every URL variant to its future local file path."""
    title, _, web_ui, tiny_ui = get_page_content(page_id)
    if not title:
        return

    safe_title = clean_filename(title)
    file_path = os.path.join(current_path, f"{safe_title}.md")

    # Map the various URL formats Confluence uses for the same page
    urls_to_map = []
    if web_ui:
        urls_to_map.append(web_ui)
        urls_to_map.append(urljoin(BASE_URL, web_ui))
        # Variant with /display/public/fluig/
        public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
        urls_to_map.append(public_ui)
        urls_to_map.append(urljoin(BASE_URL, public_ui))
    if tiny_ui:
        # Short-link format (/x/...)
        urls_to_map.append(tiny_ui)
        urls_to_map.append(urljoin(BASE_URL, tiny_ui))
    # Direct page-ID format
    id_ui = f"/pages/viewpage.action?pageId={page_id}"
    urls_to_map.append(id_ui)
    urls_to_map.append(urljoin(BASE_URL, id_ui))

    for u in urls_to_map:
        url_map[u] = file_path

    print(f"Mapping: {title}")

    children = get_page_children(page_id)
    new_path = os.path.join(current_path, safe_title)
    for child in children:
        build_tree_map(child['id'], new_path, url_map)
        time.sleep(0.1)


def extract_content(page_id, current_path, progress_data, url_map):
    """Second pass: download content and images."""
    title, html, web_ui, _ = get_page_content(page_id)
    if not title:
        return

    safe_title = clean_filename(title)
    file_path = os.path.join(current_path, f"{safe_title}.md")

    # Skip pages already extracted, but still recurse into their children:
    # progress is saved per page before its subtree finishes, so returning
    # early here would silently drop unfinished children when resuming.
    if page_id not in progress_data:
        print(f"Extracting: {title}")

        # Localize images
        processed_html = process_links_and_images(html, current_path, url_map)
        source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
        save_markdown(file_path, title, processed_html, source_url)

        progress_data[page_id] = True
        with open(PROGRESS_FILE, "w") as f:
            json.dump(progress_data, f)

        time.sleep(DELAY)

    children = get_page_children(page_id)
    new_path = os.path.join(current_path, safe_title)
    for child in children:
        extract_content(child['id'], new_path, progress_data, url_map)


def post_process_links(url_map):
    """Third pass: swap web links for relative local links."""
    print("\nStarting local-link post-processing...")

    # Sort keys by length (longest first) to avoid incorrect partial replacements
    sorted_urls = sorted(url_map.keys(), key=len, reverse=True)

    # Grab every markdown file in the output folder
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)

    for local_file_path in all_files:
        if not os.path.isfile(local_file_path):
            continue

        with open(local_file_path, "r", encoding="utf-8") as f:
            content = f.read()

        original_content = content
        current_dir = os.path.dirname(local_file_path)

        for target_url in sorted_urls:
            if target_url in content:
                target_local_path = url_map[target_url]
                # Don't replace a file's link to itself
                if os.path.abspath(target_local_path) == os.path.abspath(local_file_path):
                    continue
                rel_link = os.path.relpath(target_local_path, current_dir)
                rel_link = rel_link.replace("\\", "/")
                # Replace both the [Text](URL) form and bare URLs
                content = content.replace(f"({target_url})", f"({rel_link})")
                content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
                content = content.replace(f" {target_url} ", f" {rel_link} ")

        if content != original_content:
            with open(local_file_path, "w", encoding="utf-8") as f:
                f.write(content)
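
# Note: the Confluence REST API paginates child listings (the default page
# size is small, typically 25), so get_page_children above only sees the
# first batch and very large branches may be truncated. A minimal paginated
# sketch, assuming the content API's standard `start`/`limit` query
# parameters; get_all_page_children is a hypothetical drop-in replacement,
# not part of the original script:
def get_all_page_children(page_id, limit=50):
    results = []
    start = 0
    while True:
        url = f"{API_URL}/{page_id}/child/page?start={start}&limit={limit}"
        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            batch = response.json().get('results', [])
        except Exception as e:
            print(f"Error fetching children of page {page_id}: {e}")
            break
        results.extend(batch)
        if len(batch) < limit:  # short batch means the last page was reached
            break
        start += limit
    return results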
def main():
    if not os.path.exists(IMAGES_DIR):
        os.makedirs(IMAGES_DIR)

    url_map = {}
    if os.path.exists(URL_MAP_FILE):
        with open(URL_MAP_FILE, "r") as f:
            url_map = json.load(f)
    else:
        print("Phase 1: Mapping the URL tree...")
        build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
        with open(URL_MAP_FILE, "w") as f:
            json.dump(url_map, f)

    progress_data = {}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as f:
            progress_data = json.load(f)

    print("\nPhase 2: Extracting content and images...")
    extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)

    post_process_links(url_map)
    print("\n100% offline local wiki complete!")


if __name__ == "__main__":
    main()
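
# Example invocation (assuming the script is saved as tdn_extractor.py —
# the file name is not specified in the source):
#   python tdn_extractor.py
# Re-runs resume where they stopped: an existing url_to_path_map.json skips
# phase 1, and extraction_progress.json skips pages already written in phase 2.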