# apitdn/fluig_extractor.py
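"""Extract the Fluig documentation tree from TOTVS TDN (Confluence) into a
local Markdown wiki. The script runs in three phases:

1. build_tree_map: walk the page tree and map every URL variant of each
   page to the local .md path it will be saved under.
2. extract_content: download each page's HTML and images, convert the HTML
   to Markdown, and record progress so interrupted runs can resume.
3. post_process_links: rewrite links between downloaded pages to point at
   local relative paths instead of the web URLs.
"""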
import glob
import hashlib
import json
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Configuration
BASE_URL = "https://tdn.totvs.com"
API_URL = f"{BASE_URL}/rest/api/content"
ROOT_PAGE_ID = "653566687"  # "Documentação Técnica" (Technical Documentation) root page
OUTPUT_DIR = "fluig_rag_docs"
IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
DELAY = 1  # seconds between requests, to avoid being blocked
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Control files (allow interrupted runs to resume)
PROGRESS_FILE = "extraction_progress.json"
URL_MAP_FILE = "url_to_path_map.json"


def get_page_children(page_id):
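    """Return the direct child pages of `page_id` via the Confluence REST API."""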
    url = f"{API_URL}/{page_id}/child/page"
    # NOTE: Confluence paginates this endpoint; pages with very many children
    # may need a larger ?limit= or to follow _links.next (not handled here).
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.json().get('results', [])
    except Exception as e:
        print(f"Error fetching children of page {page_id}: {e}")
        return []

def get_page_content(page_id):
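    """Fetch a page's title, rendered HTML, and its webui/tinyui links."""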
    url = f"{API_URL}/{page_id}?expand=body.export_view"
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        data = response.json()
        title = data.get('title')
        html = data.get('body', {}).get('export_view', {}).get('value', '')
        links = data.get('_links', {})
        web_ui = links.get('webui', "")
        tiny_ui = links.get('tinyui', "")
        return title, html, web_ui, tiny_ui
    except Exception as e:
        print(f"Error fetching content of page {page_id}: {e}")
        return None, None, None, None

def clean_filename(filename):
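    """Strip characters that are invalid in Windows/Unix file names."""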
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def download_image(img_url):
"""Baixa uma imagem e retorna o caminho local relativo."""
if not img_url.startswith("http"):
img_url = urljoin(BASE_URL, img_url)
parsed_url = urlparse(img_url)
img_name = clean_filename(os.path.basename(parsed_url.path))
if not img_name:
img_name = f"img_{hash(img_url)}.png"
local_path = os.path.join(IMAGES_DIR, img_name)
if os.path.exists(local_path):
return os.path.join("images", img_name)
try:
img_data = requests.get(img_url, headers=HEADERS, timeout=20).content
with open(local_path, "wb") as f:
f.write(img_data)
return os.path.join("images", img_name)
except Exception as e:
print(f"Erro ao baixar imagem {img_url}: {e}")
return img_url
def process_links_and_images(html, current_file_dir, url_map):
"""Processa HTML para baixar imagens e preparar links locais."""
soup = BeautifulSoup(html, "html.parser")
# Processar Imagens
for img in soup.find_all("img"):
src = img.get("src")
if src:
local_img_path = download_image(src)
# Ajustar para caminho relativo ao arquivo .md atual
rel_path = os.path.relpath(os.path.join(OUTPUT_DIR, local_img_path), current_file_dir)
img["src"] = rel_path
# Processar Links (Mapeamento será feito em um segundo passo de "post-process")
return str(soup)
def save_markdown(path, title, html_content, source_url):
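    """Convert HTML to Markdown and write it with a YAML front-matter header."""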
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Convert HTML to Markdown
    markdown_content = md(html_content, heading_style="ATX", bullets="-")
    header = f"---\ntitle: {title}\nsource: {source_url}\npath: {path.replace(OUTPUT_DIR, '')}\n---\n\n"
    with open(path, "w", encoding="utf-8") as f:
        f.write(header + markdown_content)

def build_tree_map(page_id, current_path, url_map):
"""Primeiro passo: mapear todas as URLs para caminhos locais."""
title, _, web_ui, tiny_ui = get_page_content(page_id)
if not title: return
safe_title = clean_filename(title)
file_path = os.path.join(current_path, f"{safe_title}.md")
# Mapear variados formatos de URL
urls_to_map = []
if web_ui:
urls_to_map.append(web_ui)
urls_to_map.append(urljoin(BASE_URL, web_ui))
# Variação com display/public/fluig
public_ui = web_ui.replace("/display/fluig/", "/display/public/fluig/")
urls_to_map.append(public_ui)
urls_to_map.append(urljoin(BASE_URL, public_ui))
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
# Mapear por ID direto
id_ui = f"/pages/viewpage.action?pageId={page_id}"
urls_to_map.append(id_ui)
urls_to_map.append(urljoin(BASE_URL, id_ui))
# Short links format /x/
if tiny_ui:
urls_to_map.append(tiny_ui)
urls_to_map.append(urljoin(BASE_URL, tiny_ui))
for u in urls_to_map:
url_map[u] = file_path
print(f"Mapeando: {title}")
children = get_page_children(page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
build_tree_map(child['id'], new_path, url_map)
time.sleep(0.1)
def extract_content(page_id, current_path, progress_data, url_map):
"""Segundo passo: baixar conteúdo e imagens."""
if page_id in progress_data: return
title, html, web_ui, tiny_ui = get_page_content(page_id)
if not title: return
safe_title = clean_filename(title)
file_dir = current_path
file_path = os.path.join(file_dir, f"{safe_title}.md")
print(f"Extraindo: {title}")
# Processar imagens localmente
processed_html = process_links_and_images(html, file_dir, url_map)
source_url = urljoin(BASE_URL, web_ui) if web_ui else ""
save_markdown(file_path, title, processed_html, source_url)
progress_data[page_id] = True
with open(PROGRESS_FILE, "w") as f:
json.dump(progress_data, f)
time.sleep(DELAY)
children = get_page_children(page_id)
new_path = os.path.join(current_path, safe_title)
for child in children:
extract_content(child['id'], new_path, progress_data, url_map)
def post_process_links(url_map):
"""Terceiro passo: trocar links da web por links locais relativos."""
print("\nIniciando pós-processamento de links locais...")
# Ordenar chaves por tamanho (maiores primeiro) para evitar substituições parciais incorretas
sorted_urls = sorted(url_map.keys(), key=len, reverse=True)
# Pega todos os arquivos markdown na pasta de saída
import glob
all_files = glob.glob(os.path.join(OUTPUT_DIR, "**", "*.md"), recursive=True)
for local_file_path in all_files:
if not os.path.isfile(local_file_path): continue
with open(local_file_path, "r", encoding="utf-8") as f:
content = f.read()
original_content = content
current_dir = os.path.dirname(local_file_path)
for target_url in sorted_urls:
if target_url in content:
target_local_path = url_map[target_url]
# Não substituir o link para si mesmo
if os.path.abspath(target_local_path) == os.path.abspath(local_file_path): continue
rel_link = os.path.relpath(target_local_path, current_dir)
rel_link = rel_link.replace("\\", "/")
# Substituir tanto no formato [Texto](URL) quanto URL pura
content = content.replace(f"({target_url})", f"({rel_link})")
content = content.replace(f"\"{target_url}\"", f"\"{rel_link}\"")
content = content.replace(f" {target_url} ", f" {rel_link} ")
if content != original_content:
with open(local_file_path, "w", encoding="utf-8") as f:
f.write(content)
def main():
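    """Run all three phases, resuming from saved state when present."""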
    os.makedirs(IMAGES_DIR, exist_ok=True)
    url_map = {}
    if os.path.exists(URL_MAP_FILE):
        with open(URL_MAP_FILE, "r") as f:
            url_map = json.load(f)
    else:
        print("Phase 1: mapping the URL tree...")
        build_tree_map(ROOT_PAGE_ID, OUTPUT_DIR, url_map)
        with open(URL_MAP_FILE, "w") as f:
            json.dump(url_map, f)
    progress_data = {}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as f:
            progress_data = json.load(f)
    print("\nPhase 2: extracting content and images...")
    extract_content(ROOT_PAGE_ID, OUTPUT_DIR, progress_data, url_map)
    post_process_links(url_map)
    print("\nLocal wiki complete, 100% offline!")

if __name__ == "__main__":
    main()
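
# Usage sketch (assumes the default file and directory names defined above):
#   python fluig_extractor.py
# The run is resumable: url_to_path_map.json caches the phase-1 URL map and
# extraction_progress.json records which pages were already written; delete
# both files to force a full re-extraction.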