import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

INPUT_FILE = "firms.xlsx"
OUTPUT_FILE = "firm_services_output.xlsx"
PARTIAL_FILE = "firm_services_partial.xlsx"

# Browser-like User-Agent; some sites block the default python-requests UA
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Common service page paths
SERVICE_PATHS = [
    "/our-services",
    "/services",
    "/service",
    "/what-we-do",
    "/our-service-proposition",
    "/financial-planning",
    "/advice",
    "/solutions",
    "/how-we-help",
     "/financial-advice",
    "/financial-planning",
]

# Keywords for each service category
SERVICE_KEYWORDS = {
     "Investment": ["investment", "investment planning", "investment advice", "investment bond", "ISA", "wealth mangement"],
    "Retirement & Pension Advice": ["pension", "retirement planning", "Income Protection","pension transfer", "defined benefit", "defined benefit transfer"],
    "Protection": ["protection", "life protection","life insurance", "critical illness cover", "health insurance"],
    "Tax planning": ["tax planning", "income tax", "capital gain tax", "inheritance tax", "tax & estate planning", ],
    "Estate Planning": ["estate planning", "estate advice", "estate management","Long-term care planning","Inheritance Planning", "trust and estates", "later life advice"],
    "Complex Financial Planning": ["complex financial planning", "advanced planning", "comprehensive planning","equity release", "Long Term Care", "Advance Decision", "Living Will", "Disabled Discretionary Trust"]
}

def fetch_page(url):
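    """Fetch a URL and return its HTML, or None on any error or non-200 status."""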
    try:
        # Default to http:// for bare domains; requests follows redirects (e.g. to https) automatically
        if not url.startswith("http"):
            url = "http://" + url
        r = requests.get(url, headers=HEADERS, timeout=8)
        if r.status_code != 200:
            return None
        return r.text
    except requests.RequestException:
        return None

def extract_text(html):
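    """Strip markup from HTML and return the visible text, lowercased for keyword matching."""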
    try:
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(" ", strip=True).lower()
    except Exception:
        return ""

def find_service_page(base_url):
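    """Try each common service-page path in turn and return the HTML of the first one that loads."""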
    for path in SERVICE_PATHS:
        test_url = base_url.rstrip("/") + path
        html = fetch_page(test_url)
        if html:
            return html
    return None

def detect_services(text):
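    """Map each service category to "Yes"/"No" based on keyword matches in the page text."""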
    results = {}
    for service, keywords in SERVICE_KEYWORDS.items():
        # The page text is lowercased in extract_text, so compare keywords case-insensitively
        results[service] = "Yes" if any(k.lower() in text for k in keywords) else "No"
    return results

def save_partial(rows):
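    """Write the rows processed so far to the partial file, so a crash loses no progress."""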
    df = pd.DataFrame(rows)
    df.to_excel(PARTIAL_FILE, index=False)

def main():
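    """Read firms from the input sheet, scan each website for services, and write results to Excel."""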
    df = pd.read_excel(INPUT_FILE)
    output_rows = []

    # If partial file exists, load it and skip completed firms
    if os.path.exists(PARTIAL_FILE):
        partial_df = pd.read_excel(PARTIAL_FILE)
        completed_frns = set(partial_df["FRN"])
        output_rows = partial_df.to_dict("records")
    else:
        completed_frns = set()

    for _, row in df.iterrows():
        frn = row["FRN"]

        if frn in completed_frns:
            continue  # skip already processed firms

        name = row["Firm Name"]
        website = str(row["Website"]).strip()

        print(f"Scanning: {name}")

        if website.lower() in ["", "nan", "no website listed"]:
            text = ""
        else:
            service_html = find_service_page(website)
            if service_html:
                text = extract_text(service_html)
            else:
                homepage_html = fetch_page(website)
                text = extract_text(homepage_html) if homepage_html else ""

        services = detect_services(text)

        row_data = {
            "FRN": frn,
            "Firm Name": name,
            "Website": website,
            **services
        }

        output_rows.append(row_data)

        # Save partial progress after each firm
        save_partial(output_rows)

        time.sleep(1)  # polite pause between firms to avoid hammering sites

    # Save final output
    final_df = pd.DataFrame(output_rows)
    final_df.to_excel(OUTPUT_FILE, index=False)

    print("Service scanning completed.")

if __name__ == "__main__":
    main()
