o
    hQ                     @   s*  d dl Z d dlZd dlmZ d dlZd dlZd dlmZmZm	Z	 d dl
mZ d dlZd dlZddlmZmZ ddlmZmZ d dlZd dlZddlmZ dd	lmZ dd
lmZ ddlmZmZ ejejd e e!Z"e j#$e %de& d Z'Z(e j)e(dd e j#$e'dZ(G dd dZ*e* Z+dS )    N)Path)OptionalTupleDict)asynccontextmanager   )ensure_content_dirsgenerate_content_hash)CrawlResultMarkdownGenerationResult)NEED_MIGRATION)VersionManager)AsyncLogger)get_error_contextcreate_box_message)levelCRAWL4_AI_BASE_DIRECTORY	.crawl4aiTexist_okzcrawl4ai.dbc                   @   s   e Zd Zd+dedefddZdd Zd	d
 Zedd Zdd Z	dd Z
dd ZdefddZdedee fddZdefddZdefddZdd  Zd!d" Zd#ed$edefd%d&Zd'ed$edee fd(d)Zd*S ),AsyncDatabaseManager
      	pool_sizemax_retriesc                 C   sx   t | _ttjt | _|| _|| _i | _	t
 | _t
 | _t
|| _d| _t | _ttjtddddd| _d S )NFr   zcrawler_db.logr   )log_fileverbose	tag_width)DB_PATHdb_pathr   ospathdirnamecontent_pathsr   r   connection_poolasyncioLock	pool_lock	init_lock	Semaphoreconnection_semaphore_initializedr   version_managerr   joinbase_directorylogger)selfr   r    r1   T/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/async_database.py__init__   s   

zAsyncDatabaseManager.__init__c              
      s  z| j jddd tjtj| jdd | j }| 	 I dH  t
j| jdd4 I dH 7}|d	4 I dH }| I dH }|sFtd
W d  I dH  n1 I dH sVw   Y  W d  I dH  n1 I dH skw   Y  |r| j jddd |  I dH  ddlm} | I dH  | j  | j jddd W dS | j jddd W dS  ty } z| j jdddt|id | j jddd  d}~ww )z+Initialize the database and connection poolzInitializing databaseINIT)tagTr   N      >@timeoutzISELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'z"crawled_data table was not createdz%New version detected, running updatesr   )run_migrationz%Version update completed successfullyCOMPLETEz.Database initialization completed successfullyz&Database initialization error: {error}ERRORerrormessager5   paramsz)Database will be initialized on first user>   r5   )r/   infor    makedirsr!   r"   r   r,   needs_updateainit_db	aiosqliteconnectexecutefetchone	Exceptionupdate_db_schema
migrationsr9   update_versionsuccessr<   str)r0   rC   dbcursorresultr9   er1   r1   r2   
initialize,   sJ   

*(	

zAsyncDatabaseManager.initializec              	      sj   | j 4 I dH   | j D ]	}| I dH  q| j  W d  I dH  dS 1 I dH s.w   Y  dS )z&Cleanup connections when shutting downN)r'   r$   valuescloseclear)r0   connr1   r1   r2   cleanup\   s   .zAsyncDatabaseManager.cleanupc                 C  s  | j s\| j4 I dH F | j sGz|  I dH  d| _ W n+ tyF } zddl}t| }| jjdddt	||d |d dd	  d}~ww W d  I dH  n1 I dH sWw   Y  | j
 I dH  tt }zFz| j4 I dH  || jvrzbtj| jd
dI dH }|dI dH  |dI dH  |d4 I dH ,}| I dH }dd |D }h d}	|	t| }
|
rtd|
 W d  I dH  n1 I dH sw   Y  || j|< W n> ty } z1ddl}t| }d|d  d|d  d|d  dt	| d|d  
}| jjt|ddd  d}~ww W d  I dH  n1 I dH s-w   Y  | j| V  W n> tyw } z1ddl}t| }d|d  d|d  d|d  dt	| d|d  
}| jjt|ddd  d}~ww W | j4 I dH  || jv r| j|  I dH  | j|= W d  I dH  n1 I dH sw   Y  | j
  dS | j4 I dH  || jv r| j|  I dH  | j|= W d  I dH  n1 I dH sw   Y  | j
  w )z4Connection pool manager with enhanced error handlingNTr   zSDatabase initialization failed:
{error}

Context:
{context}

Traceback:
{traceback}r;   code_contextfull_traceback)r<   context	tracebackr>   r5   force_verboser?   r6   r7   zPRAGMA journal_mode = WALzPRAGMA busy_timeout = 5000PRAGMA table_info(crawled_data)c                 S      g | ]}|d  qS r   r1   ).0colr1   r1   r2   
<listcomp>       z7AsyncDatabaseManager.get_connection.<locals>.<listcomp>>   urlhtmllinksmediarM   markdownmetadata
screenshotcleaned_htmldownloaded_filesresponse_headersextracted_contentzDatabase missing columns: z.Unexpected error in db get_connection at line line_noz in functionz (filenamez
):
Error: z

Code context:
r<   )type)r>   )r+   r(   rS   rI   sysr   exc_infor/   r<   rN   r*   acquireidr%   current_taskr'   r$   rE   rF   r   rG   fetchallset
ValueErrorr   rU   release)r0   rR   ru   error_contexttask_idrW   rP   columnscolumn_namesexpected_columnsmissing_columnserror_messager1   r1   r2   get_connectionc   s   

((
*'

*
*z#AsyncDatabaseManager.get_connectionc                    s   t | jD ]o}z7|  4 I dH "}||g|R  I dH }| I dH  |W  d  I dH  W   S 1 I dH s9w   Y  W q tyu } z*|| jd kr_| jjddd| jt|dd  t	d|d  I dH  W Y d}~qd}~ww dS )z,Execute database operations with retry logicNr   z2Operation failed after {retries} attempts: {error}r;   T)retriesr<   r]   )
ranger   r   commitrI   r/   r<   rN   r%   sleep)r0   	operationargsattemptrO   rQ   rR   r1   r1   r2   execute_with_retry   s.   6	$z'AsyncDatabaseManager.execute_with_retryc              	      sj   t j| jdd4 I dH }|dI dH  | I dH  W d  I dH  dS 1 I dH s.w   Y  dS )zInitialize database schemar6   r7   Na{  
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
                    html TEXT,
                    cleaned_html TEXT,
                    markdown TEXT,
                    extracted_content TEXT,
                    success BOOLEAN,
                    media TEXT DEFAULT "{}",
                    links TEXT DEFAULT "{}",
                    metadata TEXT DEFAULT "{}",
                    screenshot TEXT DEFAULT "",
                    response_headers TEXT DEFAULT "{}",
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
            )rE   rF   r   rG   r   )r0   rO   r1   r1   r2   rD      s
   .zAsyncDatabaseManager.ainit_dbc              	      s   t j| jdd4 I dH ?}|dI dH }| I dH }dd |D }g d}|D ]}||vr9| ||I dH  q*| I dH  W d  I dH  dS 1 I dH sRw   Y  dS )z Update database schema if neededr6   r7   Nr_   c                 S   r`   ra   r1   )rb   columnr1   r1   r2   rd      re   z9AsyncDatabaseManager.update_db_schema.<locals>.<listcomp>)ri   rh   rk   rl   ro   rn   )rE   rF   r   rG   rz   aalter_db_add_columnr   )r0   rO   rP   r   r   new_columnsr   r1   r1   r2   rJ      s   .z%AsyncDatabaseManager.update_db_schema
new_columnc                    sV   |dkr| d| dI dH  n| d| dI dH  | jjddd|id	 dS )
zAdd new column to the databasero   z$ALTER TABLE crawled_data ADD COLUMN z TEXT DEFAULT "{}"Nz TEXT DEFAULT ""z'Added column '{column}' to the databaser4   r   r=   )rG   r/   rA   )r0   r   rO   r1   r1   r2   r      s   
z)AsyncDatabaseManager.aalter_db_add_columnrf   returnc              
      sf    fdd}z	  |I dH W S  ty2 } z jjddddt|id W Y d}~dS d}~ww )	z'Retrieve cached URL data as CrawlResultc              
      s  |  df4 I d H }| I d H }|s"	 W d   I d H  d S dd |jD }tt||}|d |d |d |d |d |d d	}| D ]\}}|re||d
d I d H }|pad||< qJd||< qJg d}	|	D ]!}z|| r~t	|| ni ||< W qp tj
y   i ||< Y qpw t|d tr|d |d< |d dr|d d |d< z|d rt	|d ng |d< W n tj
y   g |d< Y nw tj   fdd| D }
tdi |
W  d   I d H  S 1 I d H sw   Y  d S )Nz(SELECT * FROM crawled_data WHERE url = ?c                 S   r`   )r   r1   )rb   descriptionr1   r1   r2   rd     re   zFAsyncDatabaseManager.aget_cached_url.<locals>._get.<locals>.<listcomp>rg   rm   rj   rp   rl   )rg   rm   rj   rp   rl   screenshots_r    )ri   rh   rk   ro   rj   markdown_v2raw_markdownrn   c                    s   i | ]\}}| v r||qS r1   r1   )rb   kvvalid_fieldsr1   r2   
<dictcomp>>  s    zFAsyncDatabaseManager.aget_cached_url.<locals>._get.<locals>.<dictcomp>r1   )rG   rH   r   dictzipitems_load_contentsplitjsonloadsJSONDecodeError
isinstancer   getr
   __annotations__keys)rO   rP   rowr   row_dictcontent_fieldsfield
hash_valuecontentjson_fieldsfiltered_dictr0   rf   r   r2   _get  s\   	

""
0z2AsyncDatabaseManager.aget_cached_url.<locals>._getNz$Error retrieving cached URL: {error}r;   Tr<   r]   r   rI   r/   r<   rN   )r0   rf   r   rR   r1   r   r2   aget_cached_url  s   :
z$AsyncDatabaseManager.aget_cached_urlrQ   c           	   
      s  j dfjp	ddfdjpddfjpddfd}z?tjtr,j df|d< n-td	r;j	 df|d< ntjt
rPtjd
}| df|d< n	t  df|d< W n( ty } z| jjdt
| dd t  df|d< W Y d}~nd}~ww i  | D ]\}\}}| ||I dH  |< q fdd}z| |I dH  W dS  ty } z| jjddddt
|id W Y d}~dS d}~ww )zCache CrawlResult datarg   r   cleanedN	extractedr   )rg   rm   rj   rp   rl   rj   r   )r   z#Error processing markdown content: WARNINGr@   c                    s|   |  dj d  d  d  d jtjtjtjp$i  d tjp.i tj	p5g fI d H  d S )Na  
                INSERT INTO crawled_data (
                    url, html, cleaned_html, markdown,
                    extracted_content, success, media, links, metadata,
                    screenshot, response_headers, downloaded_files
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    html = excluded.html,
                    cleaned_html = excluded.cleaned_html,
                    markdown = excluded.markdown,
                    extracted_content = excluded.extracted_content,
                    success = excluded.success,
                    media = excluded.media,
                    links = excluded.links,
                    metadata = excluded.metadata,
                    screenshot = excluded.screenshot,
                    response_headers = excluded.response_headers,
                    downloaded_files = excluded.downloaded_files
            rg   rm   rj   rp   rl   )
rG   rf   rM   r   dumpsri   rh   rk   ro   rn   rO   content_hashesrQ   r1   r2   _cachen  s    

z/AsyncDatabaseManager.acache_url.<locals>._cachezError caching URL: {error}r;   Tr<   r]   )rg   rm   rp   rl   r   rj   r   model_dump_jsonhasattrr   rN   rI   r/   warningr   _store_contentr   r<   )	r0   rQ   content_mapmarkdown_resultrR   r   r   content_typer   r1   r   r2   
acache_urlM  sP   
#
zAsyncDatabaseManager.acache_urlc              
      s`   dd }z	|  |I dH W S  ty/ } z| jjddddt|id W Y d}~d	S d}~ww )
zGet total number of cached URLsc              	      sb   |  d4 I d H }| I d H }|r|d ndW  d   I d H  S 1 I d H s*w   Y  d S )Nz!SELECT COUNT(*) FROM crawled_datar   )rG   rH   )rO   rP   rQ   r1   r1   r2   _count  s
   0z5AsyncDatabaseManager.aget_total_count.<locals>._countNz"Error getting total count: {error}r;   Tr<   r]   r   r   )r0   r   rR   r1   r1   r2   aget_total_count  s   
z%AsyncDatabaseManager.aget_total_countc              
      d   dd }z|  |I dH  W dS  ty1 } z| jjddddt|id W Y d}~dS d}~ww )	z Clear all data from the databasec                       |  dI d H  d S )NzDELETE FROM crawled_datarG   r   r1   r1   r2   _clear     z.AsyncDatabaseManager.aclear_db.<locals>._clearNz Error clearing database: {error}r;   Tr<   r]   r   )r0   r   rR   r1   r1   r2   	aclear_db     
zAsyncDatabaseManager.aclear_dbc              
      r   )	zDrop the entire tablec                    r   )Nz!DROP TABLE IF EXISTS crawled_datar   r   r1   r1   r2   _flush  r   z.AsyncDatabaseManager.aflush_db.<locals>._flushNz Error flushing database: {error}r;   Tr<   r]   r   )r0   r   rR   r1   r1   r2   	aflush_db  r   zAsyncDatabaseManager.aflush_dbr   r   c              	      s   |sdS t |}tj| j| |}tj|sDtj|ddd4 I dH }||I dH  W d  I dH  |S 1 I dH s?w   Y  |S )z+Store content in filesystem and return hashr   wutf-8encodingN)	r	   r    r!   r-   r#   existsaiofilesopenwrite)r0   r   r   content_hash	file_pathfr1   r1   r2   r     s   z#AsyncDatabaseManager._store_contentr   c              	      s   |sdS t j| j| |}z-tj|ddd4 I dH }| I dH W  d  I dH  W S 1 I dH s5w   Y  W dS    | jjdddd|id	 Y dS )
z$Load content from filesystem by hashNrr   r   z#Failed to load content: {file_path}r;   Tr   r]   )	r    r!   r-   r#   r   r   readr/   r<   )r0   r   r   r   r   r1   r1   r2   r     s    4z"AsyncDatabaseManager._load_contentN)r   r   )__name__
__module____qualname__intr3   rS   rX   r   r   r   rD   rJ   rN   r   r   r
   r   r   r   r   r   r   r   r1   r1   r1   r2   r      s"    0
YGOr   ),r    ru   pathlibr   rE   r%   typingr   r   r   
contextlibr   loggingr   utilsr   r	   modelsr
   r   xxhashr   configr   r,   r   async_loggerr   r   r   basicConfigINFO	getLoggerr   r/   r!   r-   getenvhomer.   r   rB   r   async_db_managerr1   r1   r1   r2   <module>   s4    
   
Y