o
    h8                     @   sv  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZmZmZ d dlZd dlZd dlmZ ddlmZmZ dd	lmZ dd
lT dd
lT dd
lT ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;Z;ddl<m<Z= G dd dZ>dS )    N)Enum)initForeBackStyle)Path)OptionalListUnion)asynccontextmanager   )CrawlResultMarkdownGenerationResult)async_db_manager)*)AsyncCrawlerStrategyAsyncPlaywrightCrawlerStrategyAsyncCrawlResponse)	CacheModeCacheContext_legacy_to_cache_mode)DefaultMarkdownGeneratorMarkdownGenerationStrategy)WebScrapingStrategy)AsyncLogger)BrowserConfigCrawlerRunConfig)MIN_WORD_THRESHOLD$IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDURL_LOG_SHORTEN_LENGTH)sanitize_input_encodeInvalidCSSSelectorErrorformat_htmlfast_format_htmlcreate_box_message)urlparse)__version__c                   @   s  e Zd ZdZi Zddddeede	 dfde
e de
e dede
e d	ed
efddZdd Zdd Zdd Zdd Zdd Zedd Zdede dddddddddddfdede
e dededede
e ded ed!ed"ed#ed$ed%ed&ed'efd(d)Zded*ed+eded$ed,ed-ed'efd.d/Zdede ddddddddfd0e e de
e dededede
e ded#ed$ed%ed&ed'e e fd1d2Z!d3d4 Z"d5d6 Z#d7d8 Z$dS )9AsyncWebCrawleraB
  
    Asynchronous web crawler with flexible caching capabilities.
    
    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()
        
        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")
        
        await crawler.close()
        ```
    
    Migration Guide:
    Old way (deprecated):
        crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
    
    New way (recommended):
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        crawler = AsyncWebCrawler(config=browser_config)
    
    
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.
        
        Methods:
            start(): Start the crawler explicitly without using context manager.
            close(): Close the crawler explicitly without using context manager.
            arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
            awarmup(): Perform warmup sequence.
            arun_many(): Run the crawler for multiple sources.
            aprocess_html(): Process HTML content.
    
    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)
            
        Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS                
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    NFCRAWL4_AI_BASE_DIRECTORYcrawler_strategyconfigalways_bypass_cachealways_by_pass_cachebase_directorythread_safec           
         s(  |}|durt  fdddD r| jjddd nt }|| _ttj	|dd	| jj
d
d| _dd   D }	|pGtd|| jd|	| _| jjsR| j| j_|durh ddrdtjdtdd || _n|| _|rqt nd| _tj	|d| _tj| jdd tj| j ddd d| _dS )ax  
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        Nc                 3   s    | ]}| v V  qd S N ).0kkwargsr0   V/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/async_webcrawler.py	<genexpr>       z+AsyncWebCrawler.__init__.<locals>.<genexpr>)browser_typeheadlessviewport_widthviewport_heightz`Both browser_config and legacy browser parameters provided. browser_config will take precedence.WARNINGmessagetagz	.crawl4aizcrawler.log
   )log_fileverbose	tag_widthc                 S   s   i | ]\}}|d v r||qS ))browser_congigloggerr0   r1   r2   vr0   r0   r5   
<dictcomp>   s    z,AsyncWebCrawler.__init__.<locals>.<dictcomp>)browser_configrE   warningTz'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. Use 'always_bypass_cache' instead. Pass warning=False to suppress this warning.   
stacklevel)exist_okz/cacheFr0   )anyrE   rJ   r   from_kwargsrI   r   ospathjoinrB   itemsr   r)   getwarningswarnDeprecationWarningr+   asyncioLock_lockcrawl4ai_foldermakedirsready)
selfr)   r*   r+   r,   r-   r.   r4   rI   paramsr0   r3   r5   __init__k   sP   


zAsyncWebCrawler.__init__c                    s$   | j  I dH  |  I dH  | S )a  
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.
        
        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining
        
        Returns:
            AsyncWebCrawler: The initialized crawler instance
        N)r)   
__aenter__awarmupr_   r0   r0   r5   start   s   zAsyncWebCrawler.startc                    s   | j dddI dH  dS )a  
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().
        
        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        N)r)   	__aexit__rd   r0   r0   r5   close   s   	zAsyncWebCrawler.closec                    s   |   I d H S r/   )re   rd   r0   r0   r5   rb      s   zAsyncWebCrawler.__aenter__c                    s   |   I d H  d S r/   )rg   )r_   exc_typeexc_valexc_tbr0   r0   r5   rf      s   zAsyncWebCrawler.__aexit__c                    s"   | j jdt dd d| _dS )z
        Initialize the crawler with warm-up sequence.
        
        This method:
        1. Logs initialization info
        2. Sets up browser configuration
        3. Marks the crawler as ready
        z	Crawl4AI INIT)r?   TN)rE   infocrawl4ai_versionr^   rd   r0   r0   r5   rc      s   	
zAsyncWebCrawler.awarmupc                 C  s   dV  dS )u   异步空上下文管理器Nr0   rd   r0   r0   r5   nullcontext   s   
zAsyncWebCrawler.nullcontextTurlextraction_strategychunking_strategycontent_filter
cache_modebypass_cachedisable_cacheno_cache_readno_cache_writecss_selector
screenshotpdf
user_agentreturnc           "         s"  |}t |tr
|std| jp|  4 I dH  z|dur$|}n|||||||	|
|||||d|}t|}t||	|
|gr_|ddrQt	j
dtdd |jdu r_t|	||
|d	|_|jdu rhtj|_t||j| j}d}d}d}d}d}t }| rt|I dH }|rt|j}t|jpd
}|r|dkrdn|}|j}|j}|jr|r|jr|sd}| jj|jt|t | dd |r|s{t }|r| j !| | j j"||dI dH }t|j}|j}|j#}t }| jj|jt||| dd | j$d(|||||||j%|&drdndd|I dH }|j'|_'|j(|_(|j)|_)|j*|_*t||_+t,|dd|_-| jj+dd|j|j+t | ddd|j+rSt.j/nt.j0t.j1dd |2 rnt|snt3|I dH  |W W  d  I dH  S | jj+dd|jdt | dddt.j/t.j1dd t||_+t,|dd|_-|W W  d  I dH  S  t4y } zFt5t67 } d| d  d| d  d| d  d t| d!| d"  
}!| jj8|t9|!d#d$d%d& t:|d
d|!d'W  Y d}~W  d  I dH  S d}~ww 1 I dH s
w   Y  dS ))a  
            Runs the crawler for a single source: URL (web, local file, or raw HTML).

            Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", crawler_config=config)

            Args:
                url: The URL to crawl (http://, https://, file://, or raw:)
                crawler_config: Configuration object controlling crawl behavior
                [other parameters maintained for backwards compatibility]

            Returns:
                CrawlResult: The result of crawling and processing
            z4Invalid URL, make sure the URL is a non-empty stringN)word_count_thresholdrp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   rB   rJ   TztCache control boolean flags are deprecated and will be removed in version 0.5.0. Use 'cache_mode' parameter instead.rK   rL   )ru   rt   rv   rw    z[]FETCH)ro   successtimingr?   )r*   zraw:F)ro   htmlextracted_contentr*   ry   pdf_datarB   is_raw_html
session_idz1{url:.50}... | Status: {status} | Total: {timing}COMPLETE.2fs)ro   statusr   )r   r   r>   r?   r`   colorsz'Unexpected error in _crawl_web at line line_noz in functionz (filenamez
):
Error: z

Code context:
code_contexterror)typeERROR)ro   r   r?   )ro   r   r   error_messager0   );
isinstancestr
ValueErrorr[   rn   r   rP   rO   rU   rV   rW   rX   rs   r   r   ENABLEDr   r+   timeperf_countershould_readr   aget_cached_urlr    r   r   ry   rz   rE   
url_statusdisplay_urlboolr)   update_user_agentcrawlr   aprocess_htmlrB   
startswithstatus_coderesponse_headersdownloaded_filesssl_certificater   getattrr   r   GREENREDYELLOWshould_write
acache_url	Exceptionget_error_contextsysexc_infoerror_statusr$   r   )"r_   ro   r*   r}   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   rB   r4   crawler_configconfig_kwargscache_contextasync_responsecached_resultscreenshot_datar   r   
start_timer   t1t2crawl_resulteerror_contextr   r0   r0   r5   arun   sH  3	






	
  0
  B	
  B zAsyncWebCrawler.arunr   r   r   rB   c                    s  zD| dds
|nd}	t }
t| jd}dd |  D    fdd| D  |j||fi  }|du rDt	d	| W n) t
yW } zt	t|d}~w tyn } zt	d	| d
t| d}~ww t| dd}t| dd}t| dd}| dg }| dg }| di }|jpt }|j||d}|}t|j}| jjdd|	tt |
 d dd |du r,|jr,|jr,t|jts,t }
|jj}|dkr|js| jjddd|	id d}|||jd ||}|dkrt n|j}||}|j||}tj|dtdd}| jjd d|	t |
 dd |s1dn|}|s8dn|}|j rBt!|}t"|||||||||||||d!dd"S )#a  
            Process HTML content using the provided configuration.
            
            Args:
                url: The URL being processed
                html: Raw HTML content
                extracted_content: Previously extracted content (if any)
                config: Configuration object controlling processing behavior
                screenshot: Screenshot data (if any)
                pdf_data: PDF data (if any)
                verbose: Whether to enable verbose logging
                **kwargs: Additional parameters for backwards compatibility
            
            Returns:
                CrawlResult: Processed result containing extracted and formatted content
            r   FzRaw HTML)rE   c                 S   s   i | ]\}}|d vr||qS ))ro   r0   rF   r0   r0   r5   rH   $  s    z1AsyncWebCrawler.aprocess_html.<locals>.<dictcomp>c                    s"   i | ]\}}|   vr||qS r0   )keysrF   r`   r0   r5   rH   &     " Nz:Process HTML, Failed to extract content from the website: z	, error: cleaned_htmlr~   fit_markdownfit_htmlmedialinksmetadata)r   base_urlz)Processed {url:.50}... | Time: {timing}msSCRAPEi  )ro   r   r>   r?   r`   zGFit markdown requested but not available. Falling back to raw markdown.EXTRACTro   markdown)r   r   r   r      )indentdefaultensure_asciiz,Completed for {url:.50}... | Time: {timing}sT)ro   r   r   markdown_v2r   r   r   r   r   r   ry   rz   r   r   r   )#rU   r   r   r   rE   to_dictrT   updatescrapr   r!   r   r   r    markdown_generatorr   generate_markdownraw_markdownrl   intrp   rq   r   NoExtractionStrategyinput_formatr   rJ   IdentityChunkingchunkrunjsondumps	prettiifyr#   r   )r_   ro   r   r   r*   ry   r   rB   r4   _urlr   scrapping_strategyresultr   r   r   r   r   r   r   r   markdown_resultr   r   content_formatcontentchunkingsectionsr   r0   r   r5   r     s   






zAsyncWebCrawler.aprocess_htmlurlsc                    sV   }|dur#t dd ||||||	|
|fD r jjddd | n|||||||	|
||d
|}t| |rP|dd	rGtjd
tdd  j	du rPt
j _	 jpTd}t| fddjjdddt|id t }fdd|D }tj|dd	iI dH }t }jjddt||| ddddtjid dd |D S )a  
            Runs the crawler for multiple URLs concurrently.

            Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
            
            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, crawler_config=config)

            Args:
                urls: List of URLs to crawl
                crawler_config: Configuration object controlling crawl behavior for all URLs
                [other parameters maintained for backwards compatibility]
            
            Returns:
                List[CrawlResult]: Results for each URL
            Nc                 s   s    | ]}|d uV  qd S r/   r0   )r1   paramr0   r0   r5   r6     r7   z,AsyncWebCrawler.arun_many.<locals>.<genexpr>zXBoth crawler_config and legacy parameters provided. crawler_config will take precedence.r<   r=   )
r}   rp   rq   rr   rs   rt   rx   ry   rz   rB   rJ   Tz'bypass_cache' is deprecated and will be removed in version 0.5.0. Use 'cache_mode=CacheMode.BYPASS' instead. Pass warning=False to suppress this warning.rK   rL      c              	      s   t | j}t }jjddd| id  j} j}|jv r;|j|  }||k r;|t	d| }t
|I d H  |j|< 4 I d H  j|  dI d H W  d   I d H  S 1 I d H sbw   Y  d S )NzStarted task for {url:.50}...PARALLELro   r   r   )r   r{   )r%   netlocr   rE   debug
mean_delay	max_range_domain_last_hitrandomuniformrY   sleepr   )ro   domaincurrent_timer   r   time_since_lastdelay)r*   r_   	semaphorer{   r0   r5   crawl_with_semaphore  s.   



0z7AsyncWebCrawler.arun_many.<locals>.crawl_with_semaphorez0Starting concurrent crawling for {count} URLs...rk   countr   c                    s   g | ]} |qS r0   r0   )r1   ro   )r   r0   r5   
<listcomp>$  s    z-AsyncWebCrawler.arun_many.<locals>.<listcomp>return_exceptionszEConcurrent crawling completed for {count} URLs | Total time: {timing}r   r   r   )r   r   r   r   c                 S   s"   g | ]}t |ts|nt|qS r0   )r   r   r   )r1   r   r0   r0   r5   r   5  r   )rO   rE   rJ   r   rP   rU   rV   rW   rX   rs   r   BYPASSsemaphore_countrY   	Semaphorerl   lenr   r   gatherr   r   r   )r_   r   r*   r}   rp   rq   rr   rs   rt   rx   ry   rz   r{   rB   r4   r   r   r   r   tasksresultsend_timer0   )r*   r   r_   r   r{   r5   	arun_many  sr   .





 
zAsyncWebCrawler.arun_manyc                       t  I dH  dS )zClear the cache database.N)r   cleanuprd   r0   r0   r5   aclear_cache7     zAsyncWebCrawler.aclear_cachec                    r  )zFlush the cache database.N)r   	aflush_dbrd   r0   r0   r5   aflush_cache;  r  zAsyncWebCrawler.aflush_cachec                    s   t  I dH S )z%Get the total number of cached items.N)r   aget_total_countrd   r0   r0   r5   aget_cache_size?  s   zAsyncWebCrawler.aget_cache_size)%__name__
__module____qualname____doc__r   r   rQ   getenvr   homer   r   r   r   ra   re   rg   rb   rf   rc   r   rn   r   RegexChunkingr   ExtractionStrategyChunkingStrategyRelevantContentFilterr   r   r   r   r	   r  r
  r  r  r0   r0   r0   r5   r'   )   s   ?
R
	
  

 %	

 r'   )?rQ   r   r   rV   enumr   coloramar   r   r   r   pathlibr   typingr   r	   r
   r   rY   
contextlibr   modelsr   r   async_databaser   rq   content_filter_strategyrp   async_crawler_strategyr   r   r   r   r   r   r   markdown_generation_strategyr   r   content_scraping_strategyr   async_loggerr   async_configsr   r   r*   r   r   r   utilsr    r!   r"   r#   r$   urllib.parser%   r   r&   rm   r'   r0   r0   r0   r5   <module>   s6    