o
    h<                     @   s~  d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d	d
lT d dlZd dlZd dlZd dlmZmZmZ d dlmZ d dlmZmZ d dl Z d dl!Z!d dl"m#Z# d	d
l$T e%dZ&e&'ej( e%dZ)e)'ej( e%dZ*e*'ej( e%dZ+e+'ej( e%dZ,e,'ej( G dd deZ-G dd de-Z.G dd de-Z/dS )    )ABCabstractmethod)	webdriver)Service)By)WebDriverWait)expected_conditions)Options)InvalidArgumentExceptionWebDriverException   )*N)Image	ImageDraw	ImageFont)BytesIO)ListCallable)Pathz+selenium.webdriver.remote.remote_connectionz!selenium.webdriver.common.servicezurllib3.connectionpoolzhttp.clientz'selenium.webdriver.common.driver_finderc                   @   s\   e Zd ZededefddZedefddZedefd	d
ZededefddZ	dS )CrawlerStrategyurlreturnc                 K      d S N )selfr   kwargsr   r   V/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/crawler_strategy.pycrawl-      zCrawlerStrategy.crawl	save_pathc                 C   r   r   r   )r   r    r   r   r   take_screenshot1   r   zCrawlerStrategy.take_screenshot
user_agentc                 C   r   r   r   r   r"   r   r   r   update_user_agent5   r   z!CrawlerStrategy.update_user_agent	hook_typehookc                 C   r   r   r   r   r%   r&   r   r   r   set_hook9   r   zCrawlerStrategy.set_hookN)
__name__
__module____qualname__r   strr   r!   r$   r   r(   r   r   r   r   r   ,   s    r   c                       s0   e Zd Zd fdd	ZdedefddZ  ZS )	CloudCrawlerStrategyFc                    s   t    || _d S r   )super__init__use_cached_html)r   r0   	__class__r   r   r/   >   s   

zCloudCrawlerStrategy.__init__r   r   c                 C   s>   |gdddd}t jd|d}| }|d d d }t|S )	NTF)urlsinclude_raw_htmlforcedextract_blockszhttp://crawl4ai.uccode.io/crawl)jsonresultsr   html)requestspostr7   sanitize_input_encode)r   r   dataresponser9   r   r   r   r   B   s   zCloudCrawlerStrategy.crawl)F)r)   r*   r+   r/   r,   r   __classcell__r   r   r1   r   r-   =   s    r-   c                       s   e Zd Zd fdd	ZdedefddZdefd	d
ZdefddZde	fddZ
dddZdedefddZdefddZdd Z  ZS )LocalSeleniumCrawlerStrategyFNc                    s  t    td t | _d| j_|dr"| jd|d |dr3| jd|d  n|dd}| jd|  | jd |d	d| j_| jjrZ| jd
 | jd | jd | jd | jd | jd | jd | jd || _	|| _	|| _
|dd| _d d d d d d| _t | _tj| jd| _| d| j| _|dr|dD ]
}| j| qd S d S )Nu4   [LOG] 🚀 Initializing LocalSeleniumCrawlerStrategyTproxyz--proxy-server={}r"   z--user-agent=zsMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36z~user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36headlessz
--headlessz--disable-gpuz--window-size=1920,1080z--no-sandboxz--disable-dev-shm-usagez---disable-blink-features=AutomationControlledz--log-level=3verboseF)on_driver_createdon_user_agent_updatedbefore_get_urlafter_get_urlbefore_return_html)optionsrD   cookies)r.   r/   printr	   rI   rB   getadd_argumentformatr0   js_coderC   hooksr   servicer   Chromedriverexecute_hook
add_cookie)r   r0   rO   r   r"   cookier1   r   r   r/   P   sN   



z%LocalSeleniumCrawlerStrategy.__init__r%   r&   c                 C   s&   || j v r|| j |< d S td| )NzInvalid hook type: )rP   
ValueErrorr'   r   r   r   r(      s   
z%LocalSeleniumCrawlerStrategy.set_hookc                 G   sF   | j |}|r || }|d ur t|tjr|S td| d| jS )NzHook z5 must return an instance of webdriver.Chrome or None.)rP   rL   
isinstancer   rR   	TypeErrorrS   )r   r%   argsr&   resultr   r   r   rT      s   z)LocalSeleniumCrawlerStrategy.execute_hookr"   c                 C   sD   | j d|  | j  tj| j| j d| _| d| j| _d S )Nzuser-agent=rQ   rI   rE   )rI   rM   rS   quitr   rR   rQ   rT   r#   r   r   r   r$      s   
z.LocalSeleniumCrawlerStrategy.update_user_agentheadersc                 C   s$   | j di  | j dd|i d S )NzNetwork.enablezNetwork.setExtraHTTPHeadersr^   )rS   execute_cdp_cmd)r   r^   r   r   r   set_custom_headers   s   z/LocalSeleniumCrawlerStrategy.set_custom_headers   {Gz?c                 C   sJ   t | jj}t|D ]}t| t | jj}||kr  | jjS q
| jjS r   )lenrS   page_sourcerangetimesleep)r   
max_checkscheck_intervalinitial_lengthixcurrent_lengthr   r   r   _ensure_page_load   s   
z.LocalSeleniumCrawlerStrategy._ensure_page_loadr   r   c              
   K   s  dd l }||  }| jr@tjtdt	
 dd|}tj|r@t|d}t| W  d    S 1 s;w   Y  zE| d| j| _| jrUtd| d | j| t| jd	d
d  t| jdttjdf | jd | d| j| _t|  }d}|dds|dkrtd d}t }	d|	_|	d tj | j!|	d}
|
| | d|
| _t|
j"}|
#  |d| j$| _$| j$rt%| j$t&kr| j| j$ t| jddd  n#| j$rt%| j$t'kr| j$D ]}| j| t| jddd  q|dd}|r:t(|r'td t| jd	| ntd t| jd	t)tj*|f |sCt| jj"}| d| j|| _tjtdt	
 dd|}t|ddd}|+| W d    n	1 ssw   Y  | jrtd | d! |W S  t,y } zt-|d"stt&||_.t,d#| d$|j. d }~w t/y } zt-|d"stt&||_.t/d#| d$|j. d }~w t0y } zt-|d"stt&||_.t0d#| d$|j. d }~ww )%Nr   CRAWL4_AI_BASE_DIRECTORYz	.crawl4aicacherrF   u   [LOG] 🕸️ Crawling z& using LocalSeleniumCrawlerStrategy...   c                 S      |  ddkS Nzreturn document.readyStatecompleteexecute_script)dr   r   r   <lambda>       z4LocalSeleniumCrawlerStrategy.crawl.<locals>.<lambda>
   bodyz/window.scrollTo(0, document.body.scrollHeight);rG   Fbypass_headlessz'<html><head></head><body></body></html>uQ   [LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...Tz--window-size=5,5r\   rO   c                 S   rr   rs   ru   rS   r   r   r   rx      ry   c                 S   rr   rs   ru   r}   r   r   r   rx     ry   wait_foru#   [LOG] 🔄 Waiting for condition...rH   wutf-8)encodingu   [LOG] ✅ Crawled z successfully!msgzFailed to crawl z: )1hashlibmd5encode	hexdigestr0   ospathjoingetenvr   homeexistsopenr<   readrT   rS   rC   rK   rL   r   untilEC presence_of_all_elements_locatedr   TAG_NAMErv   rm   r	   rB   rM   r   rR   rQ   rd   r]   rO   typer,   listcallablepresence_of_element_locatedCSS_SELECTORwriter
   hasattrr   r   	Exception)r   r   r   r   url_hashcache_file_pathfr9   can_not_be_done_headlessrI   rS   jsr~   er   r   r   r      s   
 




z"LocalSeleniumCrawlerStrategy.crawlc                 C   sd  zC| j d}| j d}| j || | j  }tt|}|d}t }|j|ddd t	
| d}| jrAtd |W S  ty } zbtd	t| }	t|	 tjdd
dd}
t|
}ztdd}W n tyz   t }Y nw d}d}t||	||}d}|j||||d t }|
j|dd t	
| d}|W  Y d }~S d }~ww )Nz return document.body.scrollWidthz!return document.body.scrollHeightRGBJPEGU   )rN   qualityr   u3   [LOG] 📸 Screenshot taken and converted to base64zFailed to take screenshot: )i   iX  black)colorz	arial.ttf(   )   r   r   i  )rz   rz   )fillfont)rN   )rS   rv   set_window_sizeget_screenshot_as_pngr   r   r   convertsavebase64	b64encodegetvaluedecoderC   rK   r   r<   r,   newr   Drawr   truetypeIOErrorload_default	wrap_texttext)r   total_widthtotal_height
screenshotimage	rgb_imagebuffered
img_base64r   error_messageimgdrawr   
text_color	max_widthwrapped_texttext_positionr   r   r   r!   ,  sD   


z,LocalSeleniumCrawlerStrategy.take_screenshotc                 C   s   | j   d S r   )rS   r]   )r   r   r   r   r]   g  s   z!LocalSeleniumCrawlerStrategy.quit)FN)ra   rb   )r)   r*   r+   r/   r,   r   r(   rT   r$   dictr`   rm   r   r!   r]   r?   r   r   r1   r   r@   O   s    O
b;r@   )0abcr   r   seleniumr   !selenium.webdriver.chrome.servicer   selenium.webdriver.common.byr   selenium.webdriver.support.uir   selenium.webdriver.supportr   r   !selenium.webdriver.chrome.optionsr	   selenium.common.exceptionsr
   r   configloggingrf   r   PILr   r   r   ior   typingr   r   r:   r   pathlibr   utils	getLoggerloggersetLevelWARNINGlogger_driverurllib3_loggerhttp_client_loggerdriver_finder_loggerr   r-   r@   r   r   r   r   <module>   s>    




