o
    jhl+                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlmZ ddl	m
Z
 eedg Zg dZded	efd
dZded	efddZded	efddZded	efddZddeded	efddZded	efddZdS )    N)BeautifulSoup)settings   )parse_proxy_url_for_requests
PROXY_LIST)zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zPMozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0urlreturnc                 C   s   t tddddddddd	d
d}d}tr%t t}t|}td|  ztj| ||ddd}|  |j	W S  tj
yR } ztd|  W Y d}~dS d}~ww )zK
    Fetch product page using requests with proper headers and proxies
    zJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1documentnavigatenonez?1z	max-age=0)z
User-AgentAcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezSec-Fetch-UserzCache-ControlNz[Proxy] Using:    T)headersproxiestimeoutallow_redirectszError fetching page: )randomchoiceUSER_AGENTSr   r   printrequestsgetraise_for_statustextRequestException)r   r   r   proxyresponsee r    3/var/www/Befach/backend/bot/utils/scrape_details.pyfetch_product_page   s@   
r"   htmlc              
   C   s   zQd}t || t j}|r(|d}t dd|}t dd|}t|}|W S d}t || t j}|rO|d}t dd|}t dd|}t|}|W S W i S  tjtt j	fyp } zt
d|  W Y d	}~i S d	}~ww )
zX
    Extract product and seller data from script tags containing window.detailData.
    z$window\.detailData\s*=\s*({.*?})\s*;r   z,\s*}}z,\s*]]zdetailData\s*:\s*({.*?})\s*,zError parsing detailData: N)researchDOTALLgroupsubjsonloadsJSONDecodeErrorAttributeErrorerrorr   )r#   detail_data_patternmatchscript_datadetail_dataalt_pattern	alt_matchr   r    r    r!   extract_detail_data_from_script8   s0   



r6   c              
   C   s   z5d}t || t j}|si W S |d }t| }|s i W S td| |d ddkr3|d W S W i S  tyW } zddl	}|
  td|  W Y d}~i S d}~ww )z<
    Extract JSON-LD data from script tags in the HTML.
    z2<script type="application/ld\+json">(.*?)</script>r   zdata isz@typeProductNzError extracting JSON-LD data: )r&   findallr(   r+   r,   stripr   r   	Exception	traceback	print_exc)r#   json_ld_patternmatchesr1   datar   r;   r    r    r!   extract_json_ld_dataZ   s*   

r@   c                 C   s0   d}t d| t jrd}t d| t jrd}|S )zE
    Extract product type from HTML content without full parsing
    	PRE_ORDERzadd to cartIN_STOCKzsend inquiryINQUIRY)r&   r'   
IGNORECASE)r#   product_typer    r    r!   extract_product_type_from_htmls   s   rF   r3   c              
   C   s  i }z| s	|W S |  di }| di }|r| di }| di }| dg }| di  dg }dd	 |D }d
d	 |D }	i d| dd| dd| dd| dd| dd| dd| dd| di  di  dd| di  di  di gd  dd| d d!g d"| d#d$|	d%|d&|  d'i  d(i  d)i  d*d+| d+d,| d-d.|rt|nd/i|d< | di }
i }|
r|
 d0i |d d1< |
 d2g }|D ]1}| d3}||vrg ||< | d4g D ]}|| | d5| d6| d3| d7d8 qq||d9< |  d'i  d:i  d)i  d;g }d<d=d	 |D }||d>< | d?|d?< | d@i }|rw| dA| dB| dC| dD| dE| dF| dG| dHdI|d@< | dJg }dKdL |D |dM< |rt|}|r| dNi  dO|dP< | dQi  d3|dR< | dS|dS< |W S  ty } ztdT|  i W  Y dU}~S dU}~ww )Vz_
    Parse the extracted detailData into a structured format with product and seller info.
    
globalDataproductpriceproductRangePrices
mediaItemscustomPriceproductLadderPricesc                 S   s*   g | ]}| d | d| ddqS )minmaxrI   )rN   rO   rI   r   .0itemr    r    r!   
<listcomp>   s    
z%parse_detail_data.<locals>.<listcomp>c                 S   s0   g | ]}| d i  dr| d i  dqS )imageUrlbigrP   rQ   r    r    r!   rT      s   0 titlesubjectsku	productId	price_mindollarPriceRangeLow	price_maxdollarPriceRangeHigh	min_ordermoqunitpackage_typetradelogisticInfounitSize	lead_timeleadTimeInfoladderPeriodListr   processPeriodportdeliverPlacepayment_termssupply_abilitysupplyAbilityimagesladder_pricesample_availablenodeMapmodule_actionsprivateDataisSampleTradablesamplesample_info
sampleInforE   UNKNOWN
skuInfoMapvariantsskuAttrsnamevaluesidtype
largeImage)r   r   r}   image
attributesmodule_breadcrumb
breadCrumbz > c                 S   s    g | ]}| d r| d qS )r}   rP   rQ   r    r    r!   rT      s     categoryreviewsellercompanyNameaccountIsGoldPlusSupplierhasPassAssessmentbaoAccountIsServicecompanyBusinessTypecompanyJoinYearscompanyTotalRevenueTitletotalStaffNum)company_namegold_supplierverified_suppliertrade_assurancebusiness_typeyear_establishedannual_revenue	employeesproductBasicPropertiesc                 S   s*   i | ]}d |v rd|v r|d  |d qS )attrName	attrValuer    )rR   attrr    r    r!   
<dictcomp>   s   * z%parse_detail_data.<locals>.<dictcomp>featuresoffersr   
detail_urlbrand
brand_namedescriptionzError parsing detail data: N)r   rF   appendjoinr@   r:   r   )r3   r#   resultglobal_dataproduct_info
price_inforange_pricesmedia_itemsrp   ro   sku_info	sku_attrsattrsr   	attr_namevalue
breadcrumbr   seller_infor   json_ld_datar   r    r    r!   parse_detail_data   s   






(	


"



$
r   product_urlc                 C   s   t | }|s
ddiS t|}|rt||}|r|S t|}|rF|d|dt|dtr8|di dn|dd|di dS dd	iS )
zY
    Main function to extract product data from Alibaba product page using requests.
    r/   zFailed to fetch product pager}   r   r   )r}   r   r   r   )rH   r   zNo product data found)r"   r6   r   r@   r   
isinstancedict)r   r#   r3   parsed_datar   r    r    r!   extract_product_info   s"   
*
	r   )N)r   r&   r+   r   bs4r   timedjango.confr   scraper_utilsr   getattrr   r   strr"   r   r6   r@   rF   r   r   r    r    r    r!   <module>   s     &"p