import json
import logging
import os
import random
import re
import time
from urllib.parse import urlparse, urlunparse, urlencode, parse_qs

import requests
from bs4 import BeautifulSoup
from django.conf import settings

from bot.config import BASE_URL
from bot.utils.data_utils import extract_query_name

logger = logging.getLogger(__name__)

# Proxy pool configured in Django settings (settings.PROXY_LIST)
PROXY_LIST = settings.PROXY_LIST


def parse_proxy_url(proxy_url):
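    """Split a proxy URL (scheme://user:pass@host:port) into server, username and password parts."""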
    parsed = urlparse(proxy_url)
    return {
        "server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
        "username": parsed.username,
        "password": parsed.password
    }

def parse_proxy_url_for_requests(proxy_url):
    """Parse proxy URL into requests-compatible format"""
    if not proxy_url:
        return None
    
    return {
        'http': proxy_url,
        'https': proxy_url
    }


def get_alibaba_html(url, proxy=None, max_retries=5):
    """
    Fetch an Alibaba search results page through requests with browser-like headers
    and a (rotating) proxy, then extract the offer list embedded in the page's inline
    JavaScript. Returns the parsed offers list, or an empty result if parsing fails
    or every retry is exhausted.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
        'Referer': 'https://www.alibaba.com/',
    }

    for attempt in range(max_retries):
        current_proxy = proxy or random.choice(PROXY_LIST)
        try:
            proxy_config = parse_proxy_url_for_requests(current_proxy)
            print(f"[Proxy Try {attempt+1}] Using: {current_proxy}")

            # Add human-like delay between requests
            human_delay(1, 2)

            # Make the request with timeout and proxy
            response = requests.get(
                url,
                headers=headers,
                proxies=proxy_config,
                timeout=30,  # 30 second timeout
                allow_redirects=True,
                verify=True  # SSL verification
            )

            # Check for successful response
            response.raise_for_status()

            # Additional check for Alibaba-specific error pages
            if "sorry" in response.text.lower() or "captcha" in response.text.lower():
                raise Exception("Alibaba blocking detected")

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the <script> tag containing window.__page__data_sse10._offer_list
            script_tag = soup.find('script', string=re.compile(r"window\.__page__data_sse10\._offer_list"))
            script_text = script_tag.get_text() if script_tag else ''

            # Regex: capture the JSON object assigned to _offer_list
            match = re.search(
                r"window\.__page__data_sse10\._offer_list\s*=\s*(\{.*?\})\s*(?:;|$)",
                script_text,
                re.DOTALL
            )

            if match:
                offers_json = match.group(1)
                print("Found match")
                try:
                    offers = json.loads(offers_json)
                except json.JSONDecodeError as e:
                    print("❌ JSON parse error:", e)
                    return []

                final_data = offers.get('offerResultData', {}).get('offers')
                return final_data if final_data else []
            else:
                print("Not found match in the soup")
                return []

            # return "str", response.text

        except requests.exceptions.RequestException as e:
            print(f"[ProxyError] Failed with {current_proxy}: {e}")
            continue
        except Exception as e:
            print(f"[Error] Other error with {current_proxy}: {e}")
            continue

    print(f"❌ All {max_retries} attempts failed")
    return ""


def check_no_results_new(
    url: str,
    html_filename: str
    ):
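    """
    Fetch the offers for `url`, append them to the debug JSON file, and return
    True when the page yielded no offers.
    """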
    offers_list = get_alibaba_html(url)
    
    # 💾 Save extracted offers as JSON for debugging (append mode)
    def append_debug_html(offers_list, page_number: int, search_name: str):
        output_dir = os.path.join(settings.MEDIA_ROOT, 'output_files')
        os.makedirs(output_dir, exist_ok=True)

        full_path = os.path.join(output_dir, html_filename)
        if os.path.exists(full_path):
            with open(full_path, 'r', encoding="utf-8") as f:
                try:
                    existing = json.load(f)
                except json.JSONDecodeError:
                    existing = []
        else:
            existing = []
        
        if not isinstance(existing, list):
            existing = [existing]

        # Append new offers
        existing.extend(offers_list)
        with open(full_path, "w", encoding="utf-8") as f:
            json.dump(existing, f, ensure_ascii=False, indent=4)

    # Extract page number from URL
    try:
        page_number = int(url.split("page=")[-1])
    except:
        page_number = 0
    search_name = extract_query_name(url)
    append_debug_html(offers_list, page_number, search_name)


def fetch_and_process_page_new(
        base_url: str,
        page_number: int,
        html_filename: str
    ):
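    """
    Build the URL for `page_number`, fetch it, and persist the extracted offers.
    Returns ([], True) when the page yields no offers (signal to stop crawling),
    otherwise ([], False).
    """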

    def clean_and_append_page(url: str, page_number: int) -> str:
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        query["page"] = [str(page_number)]
        clean_query = urlencode(query, doseq=True)
        clean_url = urlunparse(parsed._replace(query=clean_query))
        return clean_url

    url = clean_and_append_page(base_url, page_number)
    print(f"Loading page {page_number}...")

    # Check whether the page returned any offers
    no_results = check_no_results_new(url, html_filename)
    if no_results:
        return [], True  # No more results, signal to stop crawling
    return [], False

def human_delay(min_sec=0.5, max_sec=2):
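    """Sleep for a random interval to mimic human browsing pauses."""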
    time.sleep(random.uniform(min_sec, max_sec))

def crawl_products_new(url: str, html_filename: str, num_pages: int, start_page: int):
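    """Crawl `num_pages` result pages starting at `start_page`, stopping early once a page has no offers."""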
    for page_number in range(start_page, start_page + num_pages):
        # Fetch and process data from the current page
        _, no_more_results = fetch_and_process_page_new(
            url,
            page_number,
            html_filename=html_filename
        )
        if no_more_results:
            # Stop crawling once a page comes back empty
            break
