o
    h                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ dd Zd	d
 Zd dlZeeZdddZd dlZejZdedefddZdededefddZdddZdedededefddZdS )     N)extract_query_name)BeautifulSoup)BASE_URL)settingsurlparsec                 C   s.   t | }|j d|j d|j |j|jdS )Nz://:)serverusernamepassword)r   schemehostnameportr
   r   	proxy_urlparsed r   2/var/www/Befach/backend/bot/utils/scraper_utils.pyparse_proxy_url   s
   r   c                 C   s   | sdS t | }| | dS )z/Parse proxy URL into requests-compatible formatN)httphttpsr   r   r   r   r   parse_proxy_url_for_requests   s   r      c                 C   s  ddddddddd	d
ddd}t |D ]}|ptt}zt|}td|d  d|  tdd tj| ||dddd}|	  d|j
 v sOd|j
 v rStdt|j
d}|jdtdd}	|	ri|	 nd}
td|
tj}|r|d}td zt|}W n tjy } ztd| g W  Y d }~W   S d }~ww |d!i d"}|r|W   S g W   S td# g W   S  tjjy } ztd$| d%|  W Y d }~qd }~w ty } ztd&| d%|  W Y d }~qd }~ww td'| d( dS ))zX
    Fetch Alibaba product page HTML using requests with proper headers and proxies
    zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1documentnavigatenonez?1z	max-age=0zhttps://www.alibaba.com/)z
User-AgentAcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezSec-Fetch-UserzCache-ControlRefererz[Proxy Try    z	] Using:       T)headersproxiestimeoutallow_redirectsverifysorrycaptchazAlibaba blocking detectedzhtml.parserscriptz'window\.__page__data_sse10\._offer_list)string zAwindow\.__page__data_sse10\._offer_list\s*=\s*(\{.*?\})\s*(?:;|$)zFound matchu   ❌ JSON parse error:NofferResultDataofferszNot found match in the soupz[ProxyError] Failed with z: z[Error] Other error with u   ❌ All z attempts failed)rangerandomchoice
PROXY_LISTr   printhuman_delayrequestsgetraise_for_statustextlower	Exceptionr   findrecompileget_textsearchDOTALLgroupjsonloadsJSONDecodeError
exceptionsRequestException)urlproxymax_retriesr#   attemptcurrent_proxyproxy_configresponsesoup
script_tagscript_textmatchoffers_jsonr.   e
final_datar   r   r   get_alibaba_html'   s~   




rU   rG   html_filenamec                    sZ   t | }dtdtf fdd}zt| dd }W n   d}Y t| }|||| d S )Npage_numbersearch_namec              	      s   t jtjd}t j|dd t j| }t j|rHt|ddd}zt	|}W n tj
y7   g }Y nw W d    n1 sBw   Y  ng }t|tsR|g}||  t|ddd}tj||dd	d
 W d    d S 1 ssw   Y  d S )Noutput_filesT)exist_okrzutf-8)encodingwF   )ensure_asciiindent)ospathjoinr   
MEDIA_ROOTmakedirsexistsopenrB   loadrD   
isinstancelistextenddump)offers_listrW   rX   
output_dir	full_pathfexistingrV   r   r   append_debug_html   s(   

"z/check_no_results_new.<locals>.append_debug_htmlzpage=r   )rU   intstrsplitr   )rG   rV   rm   rs   rW   rX   r   rr   r   check_no_results_new   s   rx   base_urlrW   c                    sl   ddl mmmm  dtdtdtf fdd}|| |}td| d	 t||}|r4g d
fS d S )Nr   )r   
urlunparse	urlencodeparse_qsrG   rW   returnc                    s@   | } |j }t|g|d< |dd}|j|d}|S )NpageT)doseq)query)r   rv   _replace)rG   rW   r   r   clean_query	clean_urlr|   r{   r   rz   r   r   clean_and_append_page   s   
z9fetch_and_process_page_new.<locals>.clean_and_append_pagezLoading page z...T)	urllib.parser   rz   r{   r|   rv   ru   r3   rx   )ry   rW   rV   r   rG   
no_resultsr   r   r   fetch_and_process_page_new   s    

r         ?r!   c                 C   s   t t| | d S )N)timesleepr0   uniform)min_secmax_secr   r   r   r4      s   r4   	num_pages
start_pagec                 C   s&   t ||| D ]	}t| ||d qd S )Nrr   )r/   r   )rG   rV   r   r   rW   r   r   r   crawl_products_new   s   r   )Nr   )r   r!   )rB   ra   r5   r0   r<   r   bot.utils.data_utilsr   bs4r   
bot.configr   django.confr   r   r   r   r   logging	getLogger__name__loggerrU   r2   rv   rx   ru   r   r4   r   r   r   r   r   <module>   s@    

U
&

