o
    ›Œh‰  ã                   @   sÄ   d dl mZmZ d dlZd dlmZ d dlZddlmZ ddl	T G dd„ deƒZ
G d	d
„ d
e
ƒZG dd„ de
ƒZG dd„ de
ƒZG dd„ de
ƒZG dd„ de
ƒZG dd„ de
ƒZG dd„ de
ƒZdS )é    )ÚABCÚabstractmethodN)ÚCounteré   ©Úload_nltk_punkt)Ú*c                   @   s&   e Zd ZdZededefdd„ƒZdS )ÚChunkingStrategyz6
    Abstract base class for chunking strategies.
    ÚtextÚreturnc                 C   s   dS )zº
        Abstract method to chunk the given text.
        
        Args:
            text (str): The text to chunk.
        
        Returns:
            list: A list of chunks.
        N© ©Úselfr
   r   r   úW/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/chunking_strategy.pyÚchunk   s   zChunkingStrategy.chunkN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚstrÚlistr   r   r   r   r   r	   	   s    r	   c                   @   s"   e Zd ZdZdedefdd„ZdS )ÚIdentityChunkingzJ
    Chunking strategy that returns the input text as a single chunk.
    r
   r   c                 C   s   |gS ©Nr   r   r   r   r   r       s   zIdentityChunking.chunkN)r   r   r   r   r   r   r   r   r   r   r   r      s    r   c                   @   s,   e Zd ZdZd	dd„Zdedefdd„ZdS )
ÚRegexChunkingzR
    Chunking strategy that splits text based on regular expression patterns.
    Nc                 K   s   |du rdg}|| _ dS )zŸ
        Initialize the RegexChunking object.
        
        Args:
            patterns (list): A list of regular expression patterns to split text.
        Nz\n\n)Úpatterns)r   r   Úkwargsr   r   r   Ú__init__(   s   
zRegexChunking.__init__r
   r   c                 C   s:   |g}| j D ]}g }|D ]}| t ||¡¡ q|}q|S r   )r   ÚextendÚreÚsplit)r   r
   Ú
paragraphsÚpatternÚnew_paragraphsÚ	paragraphr   r   r   r   3   s   
zRegexChunking.chunkr   ©r   r   r   r   r   r   r   r   r   r   r   r   r   $   s    
r   c                   @   s*   e Zd ZdZdd„ Zdedefdd„ZdS )	ÚNlpSentenceChunkingz\
    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
    c                 K   s
   t ƒ  dS )z<
        Initialize the NlpSentenceChunking object.
        Nr   )r   r   r   r   r   r   A   s   
zNlpSentenceChunking.__init__r
   r   c                 C   s.   ddl m} ||ƒ}dd„ |D ƒ}tt|ƒƒS )Nr   )Úsent_tokenizec                 S   s   g | ]}|  ¡ ‘qS r   )Ústrip)Ú.0Úsentr   r   r   Ú
<listcomp>Q   ó    z-NlpSentenceChunking.chunk.<locals>.<listcomp>)Únltk.tokenizer&   r   Úset)r   r
   r&   Ú	sentencesÚsensr   r   r   r   H   s   zNlpSentenceChunking.chunkNr$   r   r   r   r   r%   =   s    r%   c                   @   sP   e Zd ZdZddd„Zdedefdd„Zdedefd	d
„Zdedefdd„Z	dS )ÚTopicSegmentationChunkingzà
    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
    
    How it works:
    1. Segment the text into topics using TextTilingTokenizer
    2. Extract keywords for each topic segment
    é   c                 K   s   ddl }|j ¡ | _|| _dS )z³
        Initialize the TopicSegmentationChunking object.
        
        Args:
            num_keywords (int): The number of keywords to extract for each topic segment.
        r   N)ÚnltkÚtokenizeÚTextTilingTokenizerÚ	tokenizerÚnum_keywords)r   r6   r   Únlr   r   r   r   _   s   
z"TopicSegmentationChunking.__init__r
   r   c                 C   s   | j  |¡}|S r   )r5   r3   )r   r
   Úsegmented_topicsr   r   r   r   j   s   zTopicSegmentationChunking.chunkc                    sH   dd l ‰ ˆ j |¡}‡ fdd„|D ƒ}t|ƒ}dd„ | | j¡D ƒ}|S )Nr   c                    s0   g | ]}|ˆ j j d ¡vr|tjvr| ¡ ‘qS )Úenglish)ÚcorpusÚ	stopwordsÚwordsÚstringÚpunctuationÚlower)r(   Útoken©r7   r   r   r*   s   s   0 z>TopicSegmentationChunking.extract_keywords.<locals>.<listcomp>c                 S   s   g | ]\}}|‘qS r   r   )r(   ÚwordÚfreqr   r   r   r*   w   r+   )r2   ÚtoknizeÚword_tokenizer   Úmost_commonr6   )r   r
   ÚtokensÚ	freq_distÚkeywordsr   rA   r   Úextract_keywordso   s   z*TopicSegmentationChunking.extract_keywordsc                    s    ˆ   |¡}‡ fdd„|D ƒ}|S )Nc                    s   g | ]	}|ˆ   |¡f‘qS r   )rJ   )r(   Úsegment©r   r   r   r*   ~   s    z?TopicSegmentationChunking.chunk_with_topics.<locals>.<listcomp>)r   )r   r
   ÚsegmentsÚsegments_with_topicsr   rL   r   Úchunk_with_topicsz   s   
z+TopicSegmentationChunking.chunk_with_topicsN)r1   )
r   r   r   r   r   r   r   r   rJ   rO   r   r   r   r   r0   V   s    
r0   c                   @   s,   e Zd ZdZd
dd„Zdedefdd„Zd	S )ÚFixedLengthWordChunkingzÉ
    Chunking strategy that splits text into fixed-length word chunks.
    
    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    éd   c                 K   s
   || _ dS )zµ
        Initialize the fixed-length word chunking strategy with the given chunk size.
        
        Args:
            chunk_size (int): The size of each chunk in words.
        N)Ú
chunk_size)r   rR   r   r   r   r   r   ‹   s   
z FixedLengthWordChunking.__init__r
   r   c                    s*   |  ¡ ‰‡ ‡fdd„tdtˆƒˆ jƒD ƒS )Nc                    s$   g | ]}d   ˆ||ˆ j … ¡‘qS )ú )ÚjoinrR   )r(   Úi©r   r<   r   r   r*   –   s   $ z1FixedLengthWordChunking.chunk.<locals>.<listcomp>r   )r   ÚrangeÚlenrR   r   r   rV   r   r   ”   s   "zFixedLengthWordChunking.chunkN)rQ   r$   r   r   r   r   rP   ‚   s    
	rP   c                   @   ó,   e Zd ZdZddd„Zdedefdd	„Zd
S )ÚSlidingWindowChunkingzÈ
    Chunking strategy that splits text into overlapping word chunks.
    
    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    rQ   é2   c                 K   ó   || _ || _dS )a  
        Initialize the sliding window chunking strategy with the given window size and
        step size.
        
        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
        N)Úwindow_sizeÚstep)r   r]   r^   r   r   r   r   r   ¢   ó   	
zSlidingWindowChunking.__init__r
   r   c                 C   s–   |  ¡ }g }t|ƒ| jkr|gS tdt|ƒ| j d | jƒD ]}d |||| j … ¡}| |¡ q|| j t|ƒk rI| d || j d … ¡¡ |S )Nr   r   rS   )r   rX   r]   rW   r^   rT   Úappend)r   r
   r<   ÚchunksrU   r   r   r   r   r   ®   s    zSlidingWindowChunking.chunkN)rQ   r[   r$   r   r   r   r   rZ   ™   s    
rZ   c                   @   rY   )ÚOverlappingWindowChunkinga  
    Chunking strategy that splits text into overlapping word chunks.
    
    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Slide the window by the overlap size
    4. Return the list of chunks
    éè  rQ   c                 K   r\   )a)  
        Initialize the overlapping window chunking strategy with the given window size and
        overlap size.
        
        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
        N)r]   Úoverlap)r   r]   rd   r   r   r   r   r   É   r_   z"OverlappingWindowChunking.__init__r
   r   c                 C   s‚   |  ¡ }g }t|ƒ| jkr|gS d}|t|ƒk r?|| j }d |||… ¡}| |¡ |t|ƒkr4	 |S || j }|t|ƒk s|S )Nr   rS   )r   rX   r]   rT   r`   rd   )r   r
   r<   ra   ÚstartÚendr   r   r   r   r   Õ   s   


þø
zOverlappingWindowChunking.chunkN)rc   rQ   r$   r   r   r   r   rb   ¿   s    
	rb   )Úabcr   r   r   Úcollectionsr   r=   Úmodel_loaderr   Úutilsr	   r   r   r%   r0   rP   rZ   rb   r   r   r   r   Ú<module>   s    ,&