o
    h$                     @   s   d dl mZmZ d dlmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ d dlZd dlmZ ed	Zd
ededefddZG dd deZG dd deZdS )    )ABCabstractmethod)OptionalDictAnyTuple   )MarkdownGenerationResult)CustomHTML2Text)RelevantContentFilterBM25ContentFilterN)urljoinz+!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)baseurlreturnc                 C   sD   | dr|S | dr| dr| dd | S | | S t| |S )z"Fast URL joining for common cases.)http://https://mailto:z///N)
startswithendswithr   )r   r    r   b/var/www/Befach/backend/venv/lib/python3.10/site-packages/crawl4ai/markdown_generation_strategy.pyfast_urljoin   s   



r   c                   @   st   e Zd ZdZddee deeeef  fddZ	e
				dd	ed
edeeeef  dee dedefddZdS )MarkdownGenerationStrategyz7Abstract base class for markdown generation strategies.Ncontent_filteroptionsc                 C   s   || _ |pi | _d S N)r   r   selfr   r   r   r   r   __init__   s   z#MarkdownGenerationStrategy.__init__ Tcleaned_htmlbase_urlhtml2text_options	citationsr   c                 K   s   dS )z$Generate markdown from cleaned HTML.Nr   )r    r#   r$   r%   r   r&   kwargsr   r   r   generate_markdown   s   	z,MarkdownGenerationStrategy.generate_markdownNN)r"   NNT)__name__
__module____qualname____doc__r   r   r   strr   r!   r   boolr	   r(   r   r   r   r   r      s*    $r   c                       s   e Zd ZdZddee deeeef  f fddZ	dded	ed
e
eef fddZ					dded	edeeeef  deeeef  dee ded
efddZ  ZS )DefaultMarkdownGeneratoraw  
    Default implementation of markdown generation strategy.
    
    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.
    
    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
        
    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    Nr   r   c                    s   t  || d S r   )superr!   r   	__class__r   r   r!   9   s   z!DefaultMarkdownGenerator.__init__r"   markdownr$   r   c                 C   sj  i }i }g }d}d}t |D ]{}||||   | \}	}
}|r9|
ds9|
|vr5t||
||
< ||
 }
|
|vrdg }|rF|| |	rQ|	|krQ||	 ||r[dd| ndf||
< |d7 }||
 d }||dds||	 d| d	nd
|	 d| d |	 }q|||d  d|}dg}|
dd t| dd dD  |d|fS )a"  
        Convert links in markdown to citations.
        
        How it works:
        1. Find all links in the markdown.
        2. Convert links to citations.
        3. Return converted markdown and references markdown.
        
        Note:
        This function uses a regex pattern to find links in markdown.
        
        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins.
            
        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        r   r   )r   r   r   z: z - r"   !   ⟨u   ⟩z![u   ⟩]Nz

## References

c                 s   s.    | ]\}\}}d | d| | dV  qdS )r6   u   ⟩ 
Nr   ).0r   numdescr   r   r   	<genexpr>o   s
    

zFDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<genexpr>c                 S   s   | d d S )Nr   r   r   )xr   r   r   <lambda>q   s    zEDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<lambda>)key)LINK_PATTERNfinditerappendstartgroupsr   r   joingroupendextendsorteditems)r    r4   r$   link_map	url_cachepartslast_endcountermatchtextr   titler:   r9   converted_text
referencesr   r   r   convert_links_to_citations<   s8   8


z3DefaultMarkdownGenerator.convert_links_to_citationsTr#   r%   r&   c              
   K   s  zt |d}ddddddddd}	|r|	| n|r!|	| n	| jr*|	| j |jdi |	 |s7d}n	t|ts@t|}z||}
W n ty` } zdt| }
W Y d}~nd}~ww |
d	d
}
|
}d}|rz
| 	|
|\}}W n ty } z|
}dt| }W Y d}~nd}~ww d}d}|s| j
rz|p| j
}||}ddd |D }||}W n ty } zdt| }d}W Y d}~nd}~ww t|
pd|pd|pd|pd|pddW S  ty	 } zdt| }t||ddddW  Y d}~S d}~ww )a  
        Generate markdown with citations from cleaned HTML.
        
        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.
        
        Args:
            cleaned_html (str): Cleaned HTML content.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.
            
        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        )baseurlr   FT)
body_widthignore_emphasisignore_linksignore_imagesprotect_linkssingle_line_break	mark_codeescape_snobr"   z#Error converting HTML to markdown: Nz    ```z```zError generating citations: r7   c                 s   s    | ]}d  |V  qdS )z<div>{}</div>N)format)r8   sr   r   r   r;      s    z=DefaultMarkdownGenerator.generate_markdown.<locals>.<genexpr>zError generating fit markdown: )raw_markdownmarkdown_with_citationsreferences_markdownfit_markdownfit_htmlzError in markdown generation: r   )r
   updater   update_params
isinstancer.   handle	ExceptionreplacerT   r   filter_contentrD   r	   )r    r#   r$   r%   r   r   r&   r'   hdefault_optionsr`   era   rb   rc   filtered_html	error_msgr   r   r   r(   v   s   




z*DefaultMarkdownGenerator.generate_markdownr)   )r"   )r"   NNNT)r*   r+   r,   r-   r   r   r   r.   r   r!   r   rT   r/   r	   r(   __classcell__r   r   r2   r   r0   (   s0    ( <r0   )abcr   r   typingr   r   r   r   modelsr	   	html2textr
   content_filter_strategyr   r   reurllib.parser   compiler?   r.   r   r   r0   r   r   r   r   <module>   s    
