o
    h                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ dedefd	d
ZdedefddZdedefddZdedefddZdd ZdS )    N)urlparse)BeautifulSoup)async_playwright)stealth_async)human_delayproduct_urlreturnc              
   C   sR   zt d| }d}|r|d}d| W S  ty( } ztd| d}~ww )zE
    Convert a product detail URL to the description iframe URL.
    z_([0-9]+)\.html    zMhttps://www.alibaba.com/product-detail/description/descIframe.html?productId=zInvalid product URL format: N)researchgroup	Exception
ValueError)r   match
product_ide r   7/var/www/Befach/backend/bot/utils/scrape_description.pyconvert_to_description_url   s   
r   urlc              	      s   t  4 I dH j}|jjddg dddI dH }|jddd	d
ddI dH }| I dH }t|I dH  td|   |j| ddI dH  tdd td |j	dddI dH  |
 I dH }| I dH  |W  d  I dH  S 1 I dH sxw   Y  dS )zS
    Use Playwright and stealth to get HTML content after waiting for <tbody>.
    NTchrome)z--no-sandboxz---disable-blink-features=AutomationControlledz--disable-extensions2   )headlesschannelargsslow_moz[Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36i   i  )widthheightzen-US)
user_agentviewportlocaleu   ⏳ Loading domcontentloaded)
wait_untilr
      u"   ⌛ Waiting for <tbody> to load...tablei0u  )timeout)r   chromiumlaunchnew_contextnew_pager   printgotor   wait_for_selectorcontentclose)r   pbrowsercontextpagehtmlr   r   r   fetch_html_with_tbody_wait   s0   
0r5   r4   c                 C   sz   t | d}|d}|stdi }|dD ]#}|d}t|dkr:|d jdd	}|d
 jdd	}|r:|||< q|S )zO
    Extract table data (class 'all magic-1') into a key-value dictionary.
    zhtml.parserr%   z)Table with class 'all magic-1' not found.trtdr$   r   T)stripr
   )r   findr   find_alllenget_text)r4   soupr%   datarowcellskeyvaluer   r   r   parse_table_to_dict@   s   


rC   c                    s$   t | }t|I dH }t|}|S )zp
    Convert product detail URL to description URL, load it,
    and extract table data as key-value pairs.
    N)r   r5   rC   )r   description_urlr4   
table_datar   r   r    extract_product_description_dataT   s
   rF   c                    s"   i   fdd}t |   S )Nc               
      s   z$t I d H } td |  D ]\}}t| d|  | |< qW d S  ty? } ztd|  W Y d }~d S d }~ww )Nu   ✅ Extracted Table Data:z: u   ❌ Error: )rF   r+   itemsr   )r>   kvr   discriptionr   r   r   maina   s   
z!extract_description.<locals>.main)asynciorun)r   rL   r   rJ   r   extract_description^   s   
rO   )rM   r   urllib.parser   bs4r   playwright.async_apir   playwright_stealth.stealthr   bot.utils.scraper_utilsr   strr   r5   dictrC   rF   rO   r   r   r   r   <module>   s    %
