
    jhl+                         d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlmZ ddl	m
Z
  eedg       Zg dZded	efd
Zded	efdZded	efdZded	efdZddeded	efdZded	efdZy)    N)BeautifulSoup)settings   )parse_proxy_url_for_requests
PROXY_LIST)zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zPMozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0urlreturnc                    t        j                  t              ddddddddd	d
d}d}t        r2t        j                  t              }t	        |      }t        d|        	 t        j                  | ||dd      }|j                          |j                  S # t        j                  $ r}t        d|        Y d}~yd}~ww xY w)zK
    Fetch product page using requests with proper headers and proxies
    zJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1documentnavigatenonez?1z	max-age=0)z
User-AgentAcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezSec-Fetch-UserzCache-ControlNz[Proxy] Using:    T)headersproxiestimeoutallow_redirectszError fetching page: )randomchoiceUSER_AGENTSr   r   printrequestsgetraise_for_statustextRequestException)r   r   r   proxyresponsees         3/var/www/Befach/backend/bot/utils/scrape_details.pyfetch_product_pager#      s    
 mmK0^+."%($$ $G Gj).u5w'(<< 
 	!!#}}$$ %aS)*s   !5B C*B==Chtmlc                    	 d}t        j                  || t         j                        }|rV|j                  d      }t        j                  dd|      }t        j                  dd|      }t        j                  |      }|S d}t        j                  || t         j                        }|rV|j                  d      }t        j                  dd|      }t        j                  dd|      }t        j                  |      }|S 	 i S # t
        j                  t        t         j                  f$ r}t        d|        Y d	}~i S d	}~ww xY w)
zX
    Extract product and seller data from script tags containing window.detailData.
    z$window\.detailData\s*=\s*({.*?})\s*;r   z,\s*}}z,\s*]]zdetailData\s*:\s*({.*?})\s*,zError parsing detailData: N)researchDOTALLgroupsubjsonloadsJSONDecodeErrorAttributeErrorerrorr   )r$   detail_data_patternmatchscript_datadetail_dataalt_pattern	alt_matchr!   s           r"   extract_detail_data_from_scriptr8   8   s   0E		-tRYY?++a.K&&3<K&&3<K**[1K 6IIk4;	#//!,K&&3<K&&3<K**[1K  I   ."((; 0*1#.//I0s   A>D A>D (E+D??Ec                 z   	 d}t        j                  || t         j                        }|si S |d   }t        j                  |j                               }|si S t        d|       |d   j                  d      dk(  r|d   S 	 i S # t        $ r-}ddl	}|j                          t        d|        Y d}~i S d}~ww xY w)z<
    Extract JSON-LD data from script tags in the HTML.
    z2<script type="application/ld\+json">(.*?)</script>r   zdata isz@typeProductNzError extracting JSON-LD data: )r(   findallr*   r-   r.   stripr   r   	Exception	traceback	print_exc)r$   json_ld_patternmatchesr3   datar!   r>   s          r"   extract_json_ld_datarC   Z   s    5O**_dBII>I
zz%++-(Ii7;;w9,7N - I  5/s344I5s"   *B +B 'B 	B:"B55B:c                     d}t        j                  d| t         j                        rd}t        j                  d| t         j                        rd}|S )zE
    Extract product type from HTML content without full parsing
    	PRE_ORDERzadd to cartIN_STOCKzsend inquiryINQUIRY)r(   r)   
IGNORECASE)r$   product_types     r"   extract_product_type_from_htmlrJ   s   sC     L 
yyr}}5!	yy$6     r5   c           
      4   i }	 | s|S | j                  di       }|j                  di       }|r|j                  di       }|j                  di       }|j                  dg       }|j                  di       j                  dg       }|D 	cg c]5  }	|	j                  d      |	j                  d	      |	j                  d      d
7 }}	|D 	cg c]E  }	|	j                  di       j                  d      s%|	j                  di       j                  d      G }
}	i d|j                  d      d|j                  d      d|j                  d      d|j                  d      d|j                  d      d|j                  d      d|j                  d      d|j                  di       j                  di       j                  d      d|j                  di       j                  di       j                  di g      d   j                  d       d!|j                  d"      d#g d$|j                  d%      d&|
d'|d(| j                  d)i       j                  d*i       j                  d+i       j                  d,      d-|j                  d-      d.|j                  d/      d0|rt        |      nd1i|d<   |j                  di       }i }|r|j                  d2i       |d   d3<   |j                  d4g       }|D ]  }|j                  d5      }||vrg ||<   |j                  d6g       D ]W  }||   j                  |j                  d7      |j                  d8      |j                  d5      |j                  d9      d:       Y  ||d;<   | j                  d)i       j                  d<i       j                  d+i       j                  d=g       }d>j                  |D 	cg c]%  }	|	j                  d5      s|	j                  d5      ' c}	      }||d?<   |j                  d@      |d@<   |j                  dAi       }|r|j                  dB      |j                  dC      |j                  dD      |j                  dE      |j                  dF      |j                  dG      |j                  dH      |j                  dI      dJ|dA<   |j                  dKg       }|D ci c]  }dL|v sdM|v s|dL   |dM    c}|dN<   |rit	        |      }|r\|j                  dOi       j                  dP      |dQ<   |j                  dRi       j                  d5      |dS<   |j                  dT      |dT<   |S c c}	w c c}	w c c}	w c c}w # t
        $ r}t        dU|        i cY dV}~S dV}~ww xY w)Wz_
    Parse the extracted detailData into a structured format with product and seller info.
    
globalDataproductpriceproductRangePrices
mediaItemscustomPriceproductLadderPricesminmax)rT   rU   rO   imageUrlbigtitlesubjectsku	productId	price_mindollarPriceRangeLow	price_maxdollarPriceRangeHigh	min_ordermoqunitpackage_typetradelogisticInfounitSize	lead_timeleadTimeInfoladderPeriodListr   processPeriodportdeliverPlacepayment_termssupply_abilitysupplyAbilityimagesladder_pricesample_availablenodeMapmodule_actionsprivateDataisSampleTradablesamplesample_info
sampleInforI   UNKNOWN
skuInfoMapvariantsskuAttrsnamevaluesidtype
largeImage)r   r   r~   image
attributesmodule_breadcrumb
breadCrumbz > categoryreviewsellercompanyNameaccountIsGoldPlusSupplierhasPassAssessmentbaoAccountIsServicecompanyBusinessTypecompanyJoinYearscompanyTotalRevenueTitletotalStaffNum)company_namegold_supplierverified_suppliertrade_assurancebusiness_typeyear_establishedannual_revenue	employeesproductBasicPropertiesattrName	attrValuefeaturesoffersr   
detail_urlbrand
brand_namedescriptionzError parsing detail data: N)r   rJ   appendjoinrC   r=   r   )r5   r$   resultglobal_dataproduct_info
price_inforange_pricesmedia_itemsrq   itemrp   sku_info	sku_attrsattrsattr	attr_namevalue
breadcrumbr   seller_infor   json_ld_datar!   s                          r"   parse_detail_datar      s    FhM "oolB7 #y"5%))'26J%>>*>CL&**<<K (++M2>BBCXZ\]L
 '	(  xxxx'* (L ( GRydUYU]U]^hjlUmUqUqrwUxdhhz2.2259yFy!)))4!|''4! \--.CD! \--.DE	!
 \--e4! 
v.! |''.!  < @ @QS T X XYc d! [__Wb9==nbQUUVhkmjnopqrvv  xG  H! ((8!  ! !,"2"2?"C! &! ! #KOOIr$B$F$FGWY[$\$`$`anpr$s$w$w  yK  %L!  ,**84!!" |//=#!$  >t DR[%!F9,  ##E2.	,4LLr,JF9j)LLR0E HHV,	I-+-Ii(!XXh3Ei(//#iio %		& 1 %		& 1!&<!8	1  4   )| !__Y3778KRPTTUbdfgkklxz|}
::J[D$((SYJZtxx/[\%z&??84x "ooh3 + >!,1L!M%0__5H%I#.??3H#I!,1F!G$/OO4F$G"-//2L"M(___=	 F8 "%%&>C
NX  Hd\fjn\ns~  CG  tGd:.[0AA  Hz /5L'3'7'7"'E'I'I%'P|$'3'7'7'D'H'H'P|$(4(8(8(G}%i( z^ \* H  +A3/0	st   U4 BU4 :U U4 &U%2#U%KU4 U*0U*CU4 	U/U/"U/-A2U4  U4 4	V=VVVproduct_urlc                    t        |       }|sddiS t        |      }|rt        ||      }|r|S t        |      }|r|j	                  d      |j	                  d      t        |j	                  d      t              r!|j	                  di       j	                  d      n|j	                  d      d|j	                  di       dS dd	iS )
zY
    Main function to extract product data from Alibaba product page using requests.
    r1   zFailed to fetch product pager~   r   r   )r~   r   r   r   )rN   r   zNo product data found)r#   r8   r   rC   r   
isinstancedict)r   r$   r5   parsed_datar   s        r"   extract_product_infor      s     k*D788 2$7K'T: (-L %((0+//>FPQ]QaQabiQjlpFq))'26::6B  xD  xH  xH  IP  xQ
 #&&x4
 	
 ,--rK   )N)r   r(   r-   r   bs4r   timedjango.confr   scraper_utilsr   getattrr   r   strr#   r   r8   rC   rJ   r   r    rK   r"   <module>r      s     	       7 X|R0
$C $C $L #  $  Ds t 2  n4 ns nd n`.c .d .rK   