246 lines
8.4 KiB
Python
246 lines
8.4 KiB
Python
import hashlib
import json
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
|
|
|
# Configuration
BASE_URL = "https://tdn.totvs.com"
API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687"  # Root Confluence page ("Documentação Técnica" / Technical Documentation)
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1  # Seconds between requests to avoid being blocked/throttled

# Browser-like User-Agent so the server doesn't reject scripted requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Control files — allow resuming an interrupted run without refetching.
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"
|
def get_page_children(page_id):
    """Return the direct child pages of *page_id* via the Confluence REST API.

    Best-effort: on any request/parse failure the error is printed and an
    empty list is returned so the crawl can continue with the rest of the tree.
    """
    endpoint = f"{API_URL}/{page_id}/child/page"
    try:
        resp = requests.get(endpoint, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
    except Exception as e:
        print(f"Erro ao buscar filhos da página {page_id}: {e}")
        return []
    return payload.get('results', [])
|
|
def get_page_content(page_id):
    """Fetch a page's title, exported HTML body, and its webui/tinyui links.

    Returns a ``(title, html, web_ui, tiny_ui)`` tuple. On failure all four
    values are ``None``, so callers can bail out by checking the title.
    """
    endpoint = f"{API_URL}/{page_id}?expand=body.export_view"
    try:
        resp = requests.get(endpoint, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
    except Exception as e:
        print(f"Erro ao buscar conteúdo da página {page_id}: {e}")
        return None, None, None, None

    link_info = payload.get('_links', {})
    return (
        payload.get('title'),
        payload.get('body', {}).get('export_view', {}).get('value', ''),
        link_info.get('webui', ""),
        link_info.get('tinyui', ""),
    )
|
|
def clean_filename(filename):
    """Strip characters that are invalid in Windows/Unix file names."""
    forbidden = re.compile(r'[\\/*?:"<>|]')
    return forbidden.sub("", filename)
|
|
def download_image(img_url):
    """Download an image and return its path relative to OUTPUT_DIR.

    Relative URLs are resolved against BASE_URL. Images are cached on disk
    under IMAGES_DIR; an already-downloaded file is never fetched again.
    On failure the original URL is returned so the document keeps a working
    (remote) reference instead of a dead local one.
    """
    if not img_url.startswith("http"):
        img_url = urljoin(BASE_URL, img_url)

    parsed_url = urlparse(img_url)
    img_name = clean_filename(os.path.basename(parsed_url.path))
    if not img_name:
        # No usable basename (e.g. query-only URL): derive a stable name from
        # the URL with hashlib, NOT builtin hash() — hash() is randomized per
        # process (PYTHONHASHSEED), which would defeat the cache check below
        # on every new run.
        img_name = f"img_{hashlib.md5(img_url.encode('utf-8')).hexdigest()}.png"

    local_path = os.path.join(IMAGES_DIR, img_name)

    if os.path.exists(local_path):
        return os.path.join("images", img_name)

    try:
        response = requests.get(img_url, headers=HEADERS, timeout=20)
        # Without this check, a 404/500 HTML error page would be saved as
        # the "image" and then cached forever by the exists() test above.
        response.raise_for_status()
        with open(local_path, "wb") as f:
            f.write(response.content)
        return os.path.join("images", img_name)
    except Exception as e:
        print(f"Erro ao baixar imagem {img_url}: {e}")
        return img_url
|
|
def process_links_and_images(html, current_file_dir, url_map):
    """Download every <img> in *html* and rewrite its src to a local relative path.

    Link rewriting itself is deferred to the post-processing pass
    (post_process_links); this function only handles images.
    """
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup.find_all("img"):
        src = tag.get("src")
        if not src:
            continue
        stored = download_image(src)
        # download_image() returns a path relative to OUTPUT_DIR; re-express
        # it relative to the markdown file that will embed it.
        tag["src"] = os.path.relpath(os.path.join(OUTPUT_DIR, stored), current_file_dir)

    return str(soup)
|
|
def save_markdown(path, title, html_content, source_url):
    """Convert *html_content* to Markdown and write it to *path* with a front-matter header."""
    os.makedirs(os.path.dirname(path), exist_ok=True)

    body = md(html_content, heading_style="ATX", bullets="-")
    front_matter = (
        f"---\ntitle: {title}\nsource: {source_url}\n"
        f"path: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
    )

    with open(path, "w", encoding="utf-8") as f:
        f.write(front_matter + body)
|
|
def build_tree_map(page_id, current_path, url_map):
    """Pass 1: recursively map every known URL form of each page to a local .md path.

    Confluence exposes the same page under several URL shapes (webui, the
    "/display/public/" variant, the tiny/short link, and the pageId view
    action); all of them are registered in *url_map* so post_process_links()
    can later localize whichever form appears in the content.

    Mutates *url_map* in place; prints one line per page mapped.
    """
    title, _, web_ui, tiny_ui = get_page_content(page_id)
    if not title:
        return

    safe_title = clean_filename(title)
    file_path = os.path.join(current_path, f"{safe_title}.md")

    # Collect the URL variants for this page
    urls_to_map = []
    if web_ui:
        urls_to_map.append(web_ui)
        urls_to_map.append(urljoin(BASE_URL, web_ui))
        # Variant served under /display/public/fluig/
        public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
        urls_to_map.append(public_ui)
        urls_to_map.append(urljoin(BASE_URL, public_ui))

    # Short-link (/x/...) format. NOTE: the original code appended this
    # block twice verbatim; once is sufficient (url_map is a dict, so the
    # duplicates were redundant writes).
    if tiny_ui:
        urls_to_map.append(tiny_ui)
        urls_to_map.append(urljoin(BASE_URL, tiny_ui))

    # Direct page-id URL
    id_ui = f"/pages/viewpage.action?pageId={page_id}"
    urls_to_map.append(id_ui)
    urls_to_map.append(urljoin(BASE_URL, id_ui))

    for u in urls_to_map:
        url_map[u] = file_path

    print(f"Mapeando: {title}")

    children = get_page_children(page_id)
    new_path = os.path.join(current_path, safe_title)
    for child in children:
        build_tree_map(child['id'], new_path, url_map)
        time.sleep(0.1)  # be gentle with the API while walking the tree
|
|
def extract_content(page_id, current_path, progress_data, url_map):
    """Pass 2: recursively download page bodies and images, saving one .md per page.

    *progress_data* is checkpointed to PROGRESS_FILE after every page so an
    interrupted run can be resumed without refetching finished pages.
    """
    if page_id in progress_data:
        return

    title, html, web_ui, tiny_ui = get_page_content(page_id)
    if not title:
        return

    safe_title = clean_filename(title)
    file_dir = current_path
    file_path = os.path.join(file_dir, f"{safe_title}.md")

    print(f"Extraindo: {title}")

    # Localize images before the HTML -> Markdown conversion.
    processed_html = process_links_and_images(html, file_dir, url_map)
    source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
    save_markdown(file_path, title, processed_html, source_url)

    # Checkpoint immediately so a crash loses at most one page of work.
    progress_data[page_id] = True
    with open(PROGRESS_FILE, "w") as f:
        json.dump(progress_data, f)

    time.sleep(DELAY)  # rate-limit between page downloads

    child_dir = os.path.join(current_path, safe_title)
    for child in get_page_children(page_id):
        extract_content(child['id'], child_dir, progress_data, url_map)
|
|
def post_process_links(url_map):
    """Pass 3: rewrite remote TDN links inside the saved .md files as local relative links."""
    print("\nIniciando pós-processamento de links locais...")

    # Longest URLs first, so a shorter URL never clobbers the prefix of a
    # longer one during string replacement.
    sorted_urls = sorted(url_map, key=len, reverse=True)

    # Collect every markdown file under the output tree
    import glob
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)

    for local_file_path in all_files:
        if not os.path.isfile(local_file_path):
            continue

        with open(local_file_path, "r", encoding="utf-8") as f:
            content = f.read()

        original_content = content
        current_dir = os.path.dirname(local_file_path)

        for target_url in sorted_urls:
            if target_url not in content:
                continue
            target_local_path = url_map[target_url]
            # Never rewrite a file's link to itself.
            if os.path.abspath(target_local_path) == os.path.abspath(local_file_path):
                continue

            rel_link = os.path.relpath(target_local_path, current_dir).replace("\\", "/")

            # Cover markdown links, quoted attributes, and space-delimited bare URLs.
            content = content.replace(f"({target_url})", f"({rel_link})")
            content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
            content = content.replace(f" {target_url} ", f" {rel_link} ")

        # Only touch the file on disk when something actually changed.
        if content != original_content:
            with open(local_file_path, "w", encoding="utf-8") as f:
                f.write(content)
|
|
def main():
    """Run the full pipeline: map URLs (pass 1), extract content (pass 2), localize links (pass 3)."""
    os.makedirs(IMAGES_DIR, exist_ok=True)

    # Pass 1 is skipped entirely when a cached URL map already exists.
    if os.path.exists(URL_MAP_FILE):
        with open(URL_MAP_FILE, "r") as f:
            url_map = json.load(f)
    else:
        url_map = {}
        print("Fase 1: Mapeando árvore de URLs...")
        build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
        with open(URL_MAP_FILE, "w") as f:
            json.dump(url_map, f)

    # Resume from any previous partial extraction.
    progress_data = {}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as f:
            progress_data = json.load(f)

    print("\nFase 2: Extraindo conteúdo e imagens...")
    extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)

    post_process_links(url_map)
    print("\nWiki local 100% Offline concluída!")
|
|
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    main()