o
    h1)                     @   s   d dl Z d dlZde jd< d dlmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZ ddlT ddlT ddlT ddlT d d	lmZ d d
lmZ ddlmZ ddlT d dlZd dlZejddd G dd dZdS )    NfalseTOKENIZERS_PARALLELISM)Path   )UrlModelCrawlResult)init_dbget_cached_url	cache_urlDB_PATHflush_db)*)List)ThreadPoolExecutor)WebScrapingStrategyignorezBField "model_name" has conflict with protected namespace "model_".)messagec                   @   s8  e Zd Zd%dededefddZdd	 Zedd
edddde	 f	de
dededededededededefddZedd
edddde	 f	dee
 dededededededededee fddZede	 ddddd
fdedededededededefddZdeded ed!edededededed"edefd#d$ZdS )&
WebCrawlerNFcrawler_strategyalways_by_pass_cacheverbosec                 C   sh   |pt |d| _|| _tjtdt d| _	tj
| j	dd tj
| j	 ddd t  d| _d S )N)r   CRAWL4_AI_BASE_DIRECTORYz	.crawl4aiT)exist_okz/cacheF)LocalSeleniumCrawlerStrategyr   r   ospathjoingetenvr   homecrawl4ai_foldermakedirsr   ready)selfr   r   r    r#   Q/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/web_crawler.py__init__   s   
zWebCrawler.__init__c                 C   s0   t d | jddt ddd d| _t d d S )Nu(   [LOG] 🌤️  Warming up the WebCrawlerzhttps://google.com/   F)urlword_count_thresholdextraction_strategybypass_cacher   Tu'   [LOG] 🌞 WebCrawler is ready to crawl)printrunNoExtractionStrategyr!   r"   r#   r#   r$   warmup   s   zWebCrawler.warmupT	url_modelprovider	api_tokenextract_blocks_flagcss_selector
screenshotuse_cached_htmlr)   chunking_strategyreturnc                 K   s*   | j |j||	p	t |
f|j||d|S )N)r*   r4   r5   )r,   r'   r-   forced)r"   r0   r1   r2   r3   r(   r4   r5   r6   r)   r7   kwargsr#   r#   r$   
fetch_page*   s   zWebCrawler.fetch_page
url_modelsc                    s   |	pt  }	 fdd}t P}t|j|||gt| |gt| |gt| |gt| |gt| |gt| |gt| |	gt| |
gt| g|gt| R  }W d    |S 1 saw   Y  |S )Nc                    s    j | g|R i |S )N)r;   )r0   argsr:   r.   r#   r$   fetch_page_wrapperS   s   z2WebCrawler.fetch_pages.<locals>.fetch_page_wrapper)r-   r   listmaplen)r"   r<   r1   r2   r3   r(   r6   r4   r5   r)   r7   r:   r>   executorresultsr#   r.   r$   fetch_pagesD   s0   

zWebCrawler.fetch_pagesr'   r*   
user_agentc
                 K   s  z|pt  }|	|_t|tstdt|tstdt|t}d }d }d }|s/| js/t	|}|

ddr;| js;W d S |rSt|d }t|d }|rS|d }|sSd }|rW|s|r_| j| t }t| jj|fi |
}t }|	rtd| d	t| d
|| dd |r| j }| j|||||||||	t|f
i |
}t||_|W S  ty } z$t|dst||_td| d|j  t|dd|jdW  Y d }~S d }~ww )NzUnsupported extraction strategyzUnsupported chunking strategyr/   Tr      	   u   [LOG] 🚀 Crawling done for z, success: , time taken: .2f secondsmsgu   [ERROR] 🚫 Failed to crawl z	, error:  F)r'   htmlsuccesserror_message)r-   r   
isinstanceExtractionStrategy
ValueErrorChunkingStrategymaxMIN_WORD_THRESHOLDr   r	   getr!   sanitize_input_encoder   update_user_agenttimecrawlr+   booltake_screenshotprocess_htmlrN   	ExceptionhasattrstrrK   r   )r"   r'   r(   r)   r7   r*   r4   r5   rE   r   r:   cachedscreenshot_dataextracted_contentrM   t1t2crawl_resulter#   r#   r$   r,   j   sT   




&
(


zWebCrawler.runrM   rc   r(   	is_cachedc                 K   s  t   }zFt   }t }dd | D }|j||f|||dd|dtd|}|	r>td| dt   | d	d
 |d u rItd| W n ty\ } ztt	|d }~ww t
|dd}t
|dd}|dg }|dg }|di }|d u r|	rtd| d|j  ||}|||}tj|dt	dd}|	rtd| dt   | d	d |sd n|}|
st|||||dt|t|t||d
 t||t|||||||dddS )Nc                 S   s   i | ]\}}|d vr||qS ))	only_text$image_description_min_word_thresholdr#   ).0kvr#   r#   r$   
<dictcomp>   s    z+WebCrawler.process_html.<locals>.<dictcomp>ri   Frj   )r(   r4   ri   rj   u!   [LOG] 🚀 Content extracted for z, success: True, time taken: rI   rJ   z,Failed to extract content from the website: cleaned_htmlrL   markdownmedialinksmetadatau*   [LOG] 🔥 Extracting semantic blocks for z, Strategy: rF   )indentdefaultensure_asciiu   [LOG] 🚀 Extraction done for rH   z	 seconds.T)r5   )r'   rM   ro   rp   rq   rr   rs   r5   rc   rN   rO   )rY   r   itemsscraprV   $IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDr+   rR   InvalidCSSSelectorErrorr`   rW   namechunkr,   jsondumpsr
   r   format_html)r"   r'   rM   rc   r(   r)   r7   r4   r5   r   rh   r:   trd   scrapping_strategyextra_paramsresultrg   ro   rp   rq   rr   rs   sectionsr#   r#   r$   r]      s   
	 
 zWebCrawler.process_html)NFF)__name__
__module____qualname__CrawlerStrategyr[   r%   r/   DEFAULT_PROVIDERrU   RegexChunkingr   r`   rQ   rS   r   r;   r   rD   r,   intr]   r#   r#   r#   r$   r      s    		

	

)	
=	
r   )r   rY   environpathlibr   modelsr   r   databaser   r	   r
   r   r   utilsr7   r)   r   typingr   concurrent.futuresr   content_scraping_strategyr   configwarningsr}   filterwarningsr   r#   r#   r#   r$   <module>   s"    
