o
    hP                     @   s  d dl Z d dlmZ d dlZd dlmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d
dlmZ d dlZd dlZd dlZd dlmZ d dl Z de_!dede"fddZ#G dd dZ$dS )    N)Path)DictListTupleOptionalAny)tqdm)	BM25Okapi)word_tokenize)	stopwords)WordNetLemmatizer)
completionbatch_completion   )AsyncLogger)fnmatchF	file_pathreturnc                    sb   t  }| d t fdddD ]}|| qW d   | S 1 s(w   Y  | S )z/Compute MD5 hash for the file's entire content.rbc                      s
     dS )Ni   )read fr   L/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/llmtxt.py<lambda>   s   
 z$_compute_file_hash.<locals>.<lambda>    N)hashlibmd5openiterupdate	hexdigest)r   hash_md5chunkr   r   r   _compute_file_hash   s   
r$   c                   @   s<  e Zd Z			d3dedee dededdf
d	d
Zdee ddfddZ	de
deeee
 f fddZdedefddZdededdfddZde
dee
 fddZd4defddZd4d5ddZd6ded eddfd!d"Zd7d$ee
 d%e
de
fd&d'Zd8d(e
d)ede
fd*d+Zd,ee d-ed.ee
 defd/d0Zd5d1d2ZdS )9AsyncLLMTextManagerN      docs_dirloggermax_concurrent_calls
batch_sizer   c                 C   s:   || _ || _|| _|| _d | _i | _g | _| j d | _d S )Nzbm25_index.pkl)r(   r)   r*   r+   
bm25_indexdocument_maptokenized_factsbm25_index_file)selfr(   r)   r*   r+   r   r   r   __init__!   s   zAsyncLLMTextManager.__init__	doc_batchc                    s  g }|D ]G}z t |ddd}||  W d   n1 s!w   Y  W q tyL } z| jd| dt|  |d W Y d}~qd}~ww d  fd	d
|D }ztd|dd}t||D ]\}}z^t	
d|jd jjt	j}	|	s| jd|  W qet	dd|	d }
|
r|d}t |ddd}||
 W d   n1 sw   Y  | jd|  n	| jd|  W qe ty } z| jd| dt|  W Y d}~qed}~ww W dS  ty	 } z| jdt|  W Y d}~dS d}~ww )z(Process a batch of documents in parallelrutf-8encodingNError reading :  a  Given a documentation file, generate a list of atomic facts where each fact:
1. Represents a single piece of knowledge
2. Contains variations in terminology for the same concept
3. References relevant code patterns if they exist
4. Is written in a way that would match natural language queries

Each fact should follow this format:
<main_concept>: <fact_statement> | <related_terms> | <code_reference>

Example Facts:
browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]

Wrap your response in <index>...</index> tags.
c                    s&   g | ]}|rd   d| dgqS )userz*

Generate index for this documentation:

)rolecontentr   ).0r<   promptr   r   
<listcomp>N   s    z?AsyncLLMTextManager._process_document_batch.<locals>.<listcomp>z"anthropic/claude-3-5-sonnet-latest)modelmessages	logger_fnz<index>(.*?)</index>r   z(No <index>...</index> content found for z\n\s*\n
r   .q.mdwzCreated index file: z'No index content found in response for zError processing response for zError in batch completion: )r   appendr   	Exceptionr)   errorstrr   zipresearchchoicesmessager<   DOTALLwarningsubgroupstripwith_suffixwriteinfo)r0   r2   contentsr   r   emessages_list	responsesresponseindex_content_matchindex_content
index_filer   r>   r   _process_document_batch1   sn   

($z+AsyncLLMTextManager._process_document_batchlinec                 C   sV   d|vrdS dd | dD }t|dkrddt| fS |d }d	|vr)d
S dS )N|)FzMissing separator '|'c                 S      g | ]}|  qS r   rT   )r=   pr   r   r   r@   }       z;AsyncLLMTextManager._validate_fact_line.<locals>.<listcomp>r'   FzExpected 3 parts, got r   :)Fz!Missing ':' in concept definition)TN)splitlen)r0   ra   partsconcept_partr   r   r   _validate_fact_liney   s   z'AsyncLLMTextManager._validate_fact_line	fact_filec              
   C   s   | d}t|}| rqz0t|d}t|}W d   n1 s#w   Y  |d|kr2|W S | jd| d W n3 tj	yQ   | j
d| d Y n  typ } z| j
d	| d
t|  W Y d}~nd}~ww i |dS )z
        Load token cache from .q.tokens if present and matching file hash.
        Otherwise return a new structure with updated file-hash.
        	.q.tokensr3   Ncontent_hashzHash changed for z, reindex needed.zCorrupt token cache for z, rebuilding.zError reading cache for r8   factsro   )rU   r$   existsr   jsonloadgetr)   rW   JSONDecodeErrorrQ   rH   rJ   )r0   rm   
cache_filecurrent_hashr   cacherY   r   r   r   _load_or_create_token_cache   s"   
(
z/AsyncLLMTextManager._load_or_create_token_cachery   c                 C   sR   | d}t||d< t|d}t|| W d    d S 1 s"w   Y  d S )Nrn   ro   rF   )rU   r$   r   rs   dump)r0   rm   ry   rw   r   r   r   r   _save_token_cache   s
   
"z%AsyncLLMTextManager._save_token_cachetextc                    s   d|v rdd | dD n|g}tdd|d |d< t  ttdh d g }|D ](}d	|v rCd
|v rCtd|}|| t	|
 }| fdd|D  q.|S )Nrb   c                 S   rc   r   rd   r=   xr   r   r   r@      rf   z7AsyncLLMTextManager.preprocess_text.<locals>.<listcomp>z^(.*?):z\1r   english>   howwhywhatwhenwherewhich()z.[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})c                    s   g | ]}|vr  |qS r   )	lemmatize)r=   token
lemmatizer
stop_wordsr   r   r@      s
    )rh   rL   rR   r   setr   wordsfindallextendr
   lower)r0   r}   rj   tokenspartcode_tokensr   r   r   r   preprocess_text   s$   "
z#AsyncLLMTextManager.preprocess_textFc                 C   sp   |s6t j| jr6| jd t| jd}t|}W d   n1 s%w   Y  |d | _	|d | _
dS dS )zW
        Load existing BM25 index from disk, if present and clear_cache=False.
        z&Loading existing BM25 index from disk.r   Nr.   r,   TF)ospathrr   r/   r)   rW   r   picklert   r.   r,   )r0   clear_cacher   datar   r   r   maybe_load_bm25_index   s   

z)AsyncLLMTextManager.maybe_load_bm25_indexc                    sh  |r j d  j r j  t } j d  fddt j	D }g }g }g }g }|D ]K}|
d}	|s@|	 sF|| q3 |}
t|
d dks\|
dt|krb|| q3|
d  D ]\}}|| ||d	  | j|< qhq3|s|sو jd
dr j d dS  j d |rш j dt| d t| _| _t jd}t j jd| W d   dS 1 sw   Y  dS  j d dS  j t| d g }g }tt|dd}|D ]}i t|d}zt|ddd}|  }dd |dD }W d   n	1 s w   Y  |D ]6} |\}}|s=||||f q' |}|t   d|d |< || || | j|< q' !|| |" j#d d } j $d|j% d |d!d" W n! t&y } z j 'd#| d t(|  W Y d}~nd}~ww |)d$ qW d   n	1 sw   Y  |r j d%t| d& |D ]\}}} j | d | d'|dd(  d) q|| }|| } j dt| d* t| _| _t jd}t j jd| W d   n	1 sw   Y  |" j#d d } j d+|d!d" dS ),a  
        Checks for new or modified .q.md files by comparing file-hash.
        If none need reindexing and clear_cache is False, loads existing index if available.
        Otherwise, reindexes only changed/new files and merges or creates a new index.
        z0Clearing cache and rebuilding full search index.z/Checking which .q.md files need (re)indexing...c                    s    g | ]}| d r j| qS )rE   )endswithr(   r=   r   r0   r   r   r@      s     z:AsyncLLMTextManager.build_search_index.<locals>.<listcomp>rn   rq   r   ro   r   Fr   z<No new/changed .q.md files found. Using existing BM25 index.Nz9No existing BM25 index found. Building from cached facts.zBuilding BM25 index with z cached facts.wb)r,   r.   z+No facts found at all. Index remains empty.z( file(s) need reindexing. Parsing now...zIndexing changed files)totaldescrp   r3   r4   r5   c                 S   s   g | ]
}|  r|  qS r   rd   )r=   lr   r   r   r@   %  s    rD   )r   addedi   zMemory usage after r8   z.2fMBzError processing r   zFound z invalid fact lines:z
 in line: 2   z...z total facts (old + new).z*Search index updated. Final memory usage: )*r)   rW   r/   rr   unlinkpsutilProcessr   listdirr(   rU   rG   rz   ri   ru   r$   itemsr-   r   r	   r,   r.   r   r   r{   rQ   r   r   rT   rh   rl   r   timer|   memory_inforssdebugnamerH   rI   rJ   r    )r0   r   processq_filesexisting_factsexisting_tokensinvalid_linesneedSetqftoken_cache_filery   ra   
cache_datar   	new_facts
new_tokens	file_pbarfilefresh_cachef_objr<   linesis_validrI   r   	mem_usagerY   	all_facts
all_tokens	final_memr   r   r   build_search_index   s   




"





"(#*
z&AsyncLLMTextManager.build_search_indexforce_generate_factsclear_bm25_cachec                    s    j d  fddt jD }|s fdd|D }|s( j d n5tdt| jD ]+}||| j  } j d| j d  d	t| j d    |I d
H  q1 j d  j	|d d
S )a	  
        Generate index files for all documents in parallel batches
        
        Args:
            force_generate_facts (bool): If True, regenerate indexes even if they exist
            clear_bm25_cache (bool): If True, clear existing BM25 index cache
        z2Starting index generation for documentation files.c                    s6   g | ]   d rt fdddD sj  qS ).mdc                 3   s    | ]}  |V  qd S N)r   r~   r   r   r   	<genexpr>e  s    zFAsyncLLMTextManager.generate_index_files.<locals>.<listcomp>.<genexpr>)rE   .xs.md)r   anyr(   r=   r   r   r   r@   c  s    z<AsyncLLMTextManager.generate_index_files.<locals>.<listcomp>c                    s(   g | ]} j |jd d  s|qS )r   rE   )r(   r   replacerr   r   r   r   r   r@   j  s    z4All index files exist. Use force=True to regenerate.r   zProcessing batch r   /Nz:Index generation complete, building/updating search index.r   )
r)   rW   r   r   r(   rangeri   r+   r`   r   )r0   r   r   md_filesibatchr   r   r   generate_index_filesY  s"   


0z(AsyncLLMTextManager.generate_index_filesextendedsectionsmodec                    s  t  t| jd t  t| jd  }dd |D } r& fdd|D }g }t|dd dD ]1}|d	krT| j| d
 }| j| d }|t| rO|n| q0|t| j| d  q0g }	|D ]R}
z0t|
ddd}t|
j}|	d d| dd d|	   W d    n1 sw   Y  W qf t
y } z| jd|
 dt|  W Y d }~qfd }~ww |	rd|	S dS )Nz	[0-9]*.mdz[0-9]*.xs.mdc                 S   s0   h | ]}t |jd st |jdd qS )rE   .r   )r   r   r   rh   r   r   r   r   	<setcomp>  s    z/AsyncLLMTextManager.generate.<locals>.<setcomp>c                    s&   h | ] t  fd dD r qS )c                 3   s     | ]}|     v V  qd S r   )r   )r=   sectiondocr   r   r     s    z9AsyncLLMTextManager.generate.<locals>.<setcomp>.<genexpr>)r   r   r   r   r   r     s    
c                 S   s(   |  dd  rt|  dd S dS )N_r   i?B )rh   isdigitintr   r   r   r   r     s   ( z.AsyncLLMTextManager.generate.<locals>.<lambda>)key	condensedr   r   r3   r4   r5   ####################z
# rD   z

r7   r8   

---

r9   )globrJ   r(   sortedrG   rr   r   r   r   r   rH   r)   rI   join)r0   r   r   	all_files	base_docsfilesr   xs_fileregular_filer<   r   r   fnamerY   r   r   r   generate{  s4   
&(zAsyncLLMTextManager.generatequerytop_kc              	   C   s  | j sdS | |}| j |}t|}t|}|d|  }| j|||d}t| dd ddd | }	g }
|	D ]J\}}t	|
dd	}tj| j| rt| j| d
dd#}|dd }dd| dd| g}|
d| W d    n1 sw   Y  q;d|
S )Nz;No search index available. Call build_search_index() first.g      ?)
doc_scoresscore_thresholdquery_tokensc                 S   s,   | d d d | d d d  | d d  S )Nr   code_match_scoreg       @match_countg      ?total_scorer   r   r   r   r   r     s
   
z,AsyncLLMTextManager.search.<locals>.<lambda>T)r   reverserE   r   r3   r4   r5   r   r   z# r9   rD   r   )r,   r   
get_scoresnpmeanstd_aggregate_search_scoresr   r   rJ   r   r   r   rr   r(   r   rh   r   rG   r   )r0   r   r   r   r   
mean_score	std_scorer   	file_dataranked_filesresultsr   r   main_docr   only_file_namer<   r   r   r   rM     sF   




zAsyncLLMTextManager.searchr   r   r   c                 C   s   i }t |D ]w\}}||krq| j| }| j| }||vr&dddg d||< d|v r/|dn|g}	d}
t|	dkrS|	d  }| |}tt|t|@ t| }
|| d  |7  < || d  d7  < t|| d	 |
|| d	< || d
 	| q|S )Nr   )r   r   r   matched_factsrb   r'      r   r   r   r   r  )
	enumerater.   r-   rh   ri   rT   r   r   maxrG   )r0   r   r   r   r  idxscorefactr   
componentsr   code_refr   r   r   r   r     s2   



z,AsyncLLMTextManager._aggregate_search_scoresc                 C   s   | j dd dS )z&Convenience method for a full rebuild.Tr   N)r   r   r   r   r   refresh_index  s   z!AsyncLLMTextManager.refresh_index)Nr&   r'   )F)r   N)FF)r   )r&   )__name__
__module____qualname__r   r   r   r   r1   r   r`   rJ   r   boolrl   r   rz   r|   r   r   r   r   r   rM   floatr   r  r   r   r   r   r%       sH    
H 	"$,
%r%   )%r   pathlibr   rL   typingr   r   r   r   r   rs   r   r   r   numpyr   	rank_bm25r	   nltk.tokenizer
   nltk.corpusr   	nltk.stemr   litellmr   r   async_loggerr   r   r   r   r   set_verboserJ   r$   r%   r   r   r   r   <module>   s.    