import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
import re
import io
from urllib.parse import urljoin
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

INPUT_EXCEL = "websites.xlsx"
OUTPUT_CSV = "partial_results.csv"
FINAL_EXCEL = "scraped_fees.xlsx"
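# The input workbook is expected to contain "FRN", "Firm Name" and "Website" columns (see process_row)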

SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0"})

TIMEOUT = 6  # per-request timeout in seconds; keeps one slow host from stalling the run

# Most likely fee URLs first
URL_PATTERNS = [
    "/fees",
    "/fees-and-charges",
    "/our-fees",
    "/charging-structure",
    "/pricing",
    "/costs",
    "/charges",
    "/client-fees",
    "/about/fees",
    "/services/investments",
    "/how-we-work",
    "/how-it-all-works/our-fees",
"/our-approach","/financial-advice-fees"
]

# Matches "£" amounts (optional thousands separators and decimals) or percentages such as "0.75%"
FEE_REGEX = re.compile(r"(£\s*\d+(?:,\d{3})*(?:\.\d+)?)|(\d+(?:\.\d+)?\s*%)")

# Keyword buckets used to classify a matched fee by the wording of its line
INITIAL_KW = ["initial", "setup", "first meeting", "one-off", "fixed fee"]
ONGOING_KW = ["ongoing", "annual", "yearly", "per year", "maintenance"]
INVEST_KW = ["platform", "custody", "investment", "fund", "ocf", "amc", "ter", "service fee"]

PDF_CACHE = {}       # url -> (initial, ongoing, invest), so the same PDF is only downloaded once
write_lock = Lock()  # serialises appends to the partial results CSV across worker threads


# -----------------------------
# FAST HTML FETCH
# -----------------------------
def fetch_html(url):
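    """Fetch a URL and return its HTML body, or None on any error or non-200 response."""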
    try:
        r = SESSION.get(url, timeout=TIMEOUT)
        if r.status_code == 200:
            return r.text
        return None
    except Exception:
        return None


# -----------------------------
# FEE EXTRACTION
# -----------------------------
def extract_fee_from_text(text):
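    """Scan text line by line for fee figures.

    Returns an (initial, ongoing, investment) tuple; each slot is filled by the first
    regex match on a line containing one of that category's keywords, or left as None.
    """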
    if not text:
        return None, None, None

    initial = ongoing = invest = None

    for line in text.split("\n"):
        lower = line.lower()
        match = FEE_REGEX.search(line)
        if not match:
            continue

        fee = match.group().strip()

        if not initial and any(k in lower for k in INITIAL_KW):
            initial = fee
        elif not ongoing and any(k in lower for k in ONGOING_KW):
            ongoing = fee
        elif not invest and any(k in lower for k in INVEST_KW):
            invest = fee

        if initial and ongoing and invest:
            break

    return initial, ongoing, invest


# -----------------------------
# HTML PARSING
# -----------------------------
def extract_from_html(html):
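    """Flatten the page to plain text and run the fee extractor over it."""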
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)
    return extract_fee_from_text(text)


# -----------------------------
# PDF EXTRACTION (cached)
# -----------------------------
def extract_from_pdf(url):
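    """Download a PDF, extract its text, and return its fee tuple (cached per URL)."""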
    if url in PDF_CACHE:
        return PDF_CACHE[url]

    try:
        r = SESSION.get(url, timeout=TIMEOUT)
        if r.status_code != 200:
            # Cache the failure so other pages linking to the same PDF don't re-download it
            PDF_CACHE[url] = (None, None, None)
            return None, None, None

        reader = PdfReader(io.BytesIO(r.content))
        full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

        result = extract_fee_from_text(full_text)
        PDF_CACHE[url] = result
        return result

    except Exception:
        PDF_CACHE[url] = (None, None, None)
        return None, None, None


# -----------------------------
# PDF LINK FINDING
# -----------------------------
def find_pdf_links(html, base_url):
    """Return absolute URLs for every PDF linked from the page."""
    soup = BeautifulSoup(html, "html.parser")
    pdfs = []

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".pdf"):
            # urljoin resolves relative, root-relative and absolute hrefs correctly
            pdfs.append(urljoin(base_url, href))

    return pdfs


# -----------------------------
# MAIN SCRAPER FOR ONE WEBSITE (SEQUENTIAL, DEEP)
# -----------------------------
def scrape_website(domain):
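    """Probe each candidate fee URL on a domain, pulling fees from HTML and any linked PDFs.

    Returns a 9-element list: three HTML fees, three PDF fees, and the PDF URL each
    PDF fee was taken from.
    """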
    domain = domain.strip().rstrip("/")
    if not domain.startswith("http"):
        domain = "http://" + domain

    html_initial = html_ongoing = html_invest = None
    pdf_initial = pdf_ongoing = pdf_invest = None
    pdf_initial_url = pdf_ongoing_url = pdf_invest_url = None

    any_success = False

    for pattern in URL_PATTERNS:
        url = domain + pattern
        html = fetch_html(url)
        if not html:
            continue

        any_success = True

        # HTML extraction
        i, o, inv = extract_from_html(html)
        if i and not html_initial:
            html_initial = i
        if o and not html_ongoing:
            html_ongoing = o
        if inv and not html_invest:
            html_invest = inv

        # PDF extraction
        pdf_links = find_pdf_links(html, url)  # resolve relative links against the page URL, not the bare domain
        for pdf in pdf_links:
            pi, po, pinv = extract_from_pdf(pdf)

            if pi and not pdf_initial:
                pdf_initial = pi
                pdf_initial_url = pdf

            if po and not pdf_ongoing:
                pdf_ongoing = po
                pdf_ongoing_url = pdf

            if pinv and not pdf_invest:
                pdf_invest = pinv
                pdf_invest_url = pdf

        # Early stop if all three found from HTML
        if html_initial and html_ongoing and html_invest:
            break

    # No candidate URL responded at all
    if not any_success:
        return [
            "website error", "website error", "website error",
            "website error", "website error", "website error",
            None, None, None
        ]

    return [
        html_initial, html_ongoing, html_invest,
        pdf_initial, pdf_ongoing, pdf_invest,
        pdf_initial_url, pdf_ongoing_url, pdf_invest_url
    ]


# -----------------------------
# PARALLEL SCRAPER WORKER
# -----------------------------
def process_row(row, done):
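    """Scrape a single spreadsheet row; returns a CSV result row, or None if skipped."""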
    frn = str(row["FRN"]).strip()
    firm_name = str(row["Firm Name"]).strip()
    website = str(row["Website"]).strip()

    if not website:
        return None

    if website in done:
        print(f"[SKIP] {website}")
        return None

    print(f"[SCRAPING] {website}")
    fees = scrape_website(website)

    return [frn, firm_name, website] + fees


# -----------------------------
# MAIN RUNNER (PARALLEL OVER WEBSITES)
# -----------------------------
def run_scraper():
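    """Read the input workbook, scrape each website in parallel, append results to the
    partial CSV as they complete, then export everything to the final Excel file."""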
    df = pd.read_excel(INPUT_EXCEL)

    # Ensure partial file exists AND has headers
    if not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0:
        with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([
                "FRN", "FirmName", "Website",
                "InitialFee", "OngoingFee", "InvestmentFee",
                "PDF_InitialFee", "PDF_OngoingFee", "PDF_InvestmentFee",
                "PDF_Initial_URL", "PDF_Ongoing_URL", "PDF_Invest_URL"
            ])

    # Load completed websites safely
    try:
        partial = pd.read_csv(OUTPUT_CSV)
        done = set(str(x).strip() for x in partial["Website"].dropna())
    except Exception:
        done = set()

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {
            executor.submit(process_row, row, done): row
            for _, row in df.iterrows()
        }

        for future in as_completed(futures):
            result = future.result()
            if result is None:
                continue

            with write_lock:
                with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(result)

    final_df = pd.read_csv(OUTPUT_CSV)
    final_df.to_excel(FINAL_EXCEL, index=False)
    print("Done.")


if __name__ == "__main__":
    run_scraper()
