
    h                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d Zd Zd dlZ ej&                  e      Zdd	Zd dlZej.                  Zd
edefdZdededefdZddZd
edededefdZy)    N)extract_query_name)BeautifulSoup)BASE_URL)settingsurlparsec                     t        |       }|j                   d|j                   d|j                   |j                  |j
                  dS )Nz://:)serverusernamepassword)r   schemehostnameportr   r   	proxy_urlparseds     2/var/www/Befach/backend/bot/utils/scraper_utils.pyparse_proxy_urlr      sE    i F]]O3v&7qFOOOO     c                 (    | syt        |       }| | dS )z/Parse proxy URL into requests-compatible formatN)httphttpsr   r   s     r   parse_proxy_url_for_requestsr      s"    i F r   c                    ddddddddd	d
ddd}t        |      D ]  }|xs t        j                  t              }	 t	        |      }t        d|dz    d|        t        dd       t        j                  | ||ddd      }|j                          d|j                  j                         v sd|j                  j                         v rt        d      t        |j                  d      }|j                  dt        j                   d            }	|	r|	j#                         nd}
t        j$                  d|
t        j&                        }|r]|j)                  d      }t        d       	 t+        j,                  |      }|j                  d!i       j                  d"      }|r|c S g c S t        d#       g c S  t        d'| d(       y# t*        j.                  $ r}t        d|       g cY d }~c S d }~ww xY w# t        j0                  j2                  $ r}t        d$| d%|        Y d }~d }~wt        $ r}t        d&| d%|        Y d }~6d }~ww xY w))zX
    Fetch Alibaba product page HTML using requests with proper headers and proxies
    zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1documentnavigatenonez?1z	max-age=0zhttps://www.alibaba.com/)z
User-AgentAcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezSec-Fetch-UserzCache-ControlRefererz[Proxy Try    z	] Using:       T)headersproxiestimeoutallow_redirectsverifysorrycaptchazAlibaba blocking detectedzhtml.parserscriptz'window\.__page__data_sse10\._offer_list)string zAwindow\.__page__data_sse10\._offer_list\s*=\s*(\{.*?\})\s*(?:;|$)zFound matchu   ❌ JSON parse error:NofferResultDataofferszNot found match in the soupz[ProxyError] Failed with z: z[Error] Other error with u   ❌ All z attempts failed)rangerandomchoice
PROXY_LISTr   printhuman_delayrequestsgetraise_for_statustextlower	Exceptionr   findrecompileget_textsearchDOTALLgroupjsonloadsJSONDecodeError
exceptionsRequestException)urlproxymax_retriesr&   attemptcurrent_proxyproxy_configresponsesoup
script_tagscript_textmatchoffers_jsonr1   e
final_datas                   r   get_alibaba_htmlrX   '   sK   
 H^+."%($$ $-G %:z!:;	7FLK	{)M?CD 1  ||$ $H %%' (----//9@S@S@U3U ;<< >D 8BJJ?i4jkJ3=*--/2K IIT		E #kk!nm$!ZZ4F
 $ZZ(92>BB8L
%/z7R734	i &~ 
H[M!1
23) ++ 115I ""33 	-m_BqcBC 	-m_BqcBC	s`   D,G:*G
?$G:&G:*G:
G7G2*G7+G:2G77G::IH..I:IIrJ   html_filenamec                     t        |       }dt        dt        ffd}	 t        | j                  d      d         }t	        |       } ||||       y #  d}Y xY w)Npage_numbersearch_namec                 |   t         j                  j                  t        j                  d      }t        j
                  |d       t         j                  j                  |      }t         j                  j                  |      r.t        |dd      5 }	 t        j                  |      }d d d        ng }t        t              s|g}|j                  |        t        |dd      5 }t        j                  ||dd	
       d d d        y # t        j                  $ r g }Y ww xY w# 1 sw Y   yxY w# 1 sw Y   y xY w)Noutput_filesT)exist_okrzutf-8)encodingwF   )ensure_asciiindent)ospathjoinr   
MEDIA_ROOTmakedirsexistsopenrE   loadrG   
isinstancelistextenddump)offers_listr[   r\   
output_dir	full_pathfexistingrY   s          r   append_debug_htmlz/check_no_results_new.<locals>.append_debug_html   s
   WW\\("5"5~F

J.GGLL];	77>>)$iw7 "1"#yy|H" " H(D) zH 	$)S73 	AqIIha@	A 	A ++ "!H"" "	A 	As6   D&D(D2D# D&"D##D&&D/2D;zpage=r   )rX   intstrsplitr   )rJ   rY   rr   rw   r[   r\   s    `    r   check_no_results_newr|      sf     #3'KAC Ac A.#))G,R01 %S)Kk;<s   A Abase_urlr[   c                    	 ddl mm	mm dt
        dt        dt
        f	fd} || |      }t        d| d       t        ||      }|rg d	fS y )
Nr   )r   
urlunparse	urlencodeparse_qsrJ   r[   returnc                      |       } |j                         }t        |      g|d<    |d      } 	|j                  |            }|S )NpageT)doseq)query)r   rz   _replace)
rJ   r[   r   r   clean_query	clean_urlr   r   r   r   s
         r   clean_and_append_pagez9fetch_and_process_page_new.<locals>.clean_and_append_page   sR    #&[)*fT2v[AB	r   zLoading page z...T)	urllib.parser   r   r   r   rz   ry   r6   r|   )
r}   r[   rY   r   rJ   
no_resultsr   r   r   r   s
         @@@@r   fetch_and_process_page_newr      si    
 GF3 S S    +
6C	M+c
*+ &c=9J4x r   c                 V    t        j                  t        j                  | |             y )N)timesleepr3   uniform)min_secmax_secs     r   r7   r7      s    JJv~~gw/0r   	num_pages
start_pagec                 H    t        |||z         D ]  }t        | ||        y )N)rY   )r2   r   )rJ   rY   r   r   r[   s        r   crawl_products_newr      s*    Zi)?@"'	
 Ar   )N   )g      ?r$   )rE   rf   r8   r3   r?   r   bot.utils.data_utilsr   bs4r   
bot.configr   django.confr   r   r   r   r   logging	getLogger__name__loggerrX   r5   rz   r|   ry   r   r7   r    r   r   <module>r      s     	   	  3     !	 			8	$Sj    
#=	#=#=L ,1
C 
 
 
QT 
r   