
    ?im                        d Z ddlZddlmZ ddlmZmZmZ h dZ	 	 	 ddeee	ef      deee	ef      de
d	e
d
e
dee	ee	   f   fdZdeee	ef      dee	   fdZdeee	ef      dee	   fdZdeee	ef      dee	   fdZy)zHEntity extraction from Phase 1 search results for supplemental searches.    N)Counter)AnyDictList>   xbbccnnjackmetaapplegithubgoogleopenairedditnytimesreuterstwitteryoutubeelonmuskverified	microsoft	wikipediasundarpichaiwashingtonpostreddit_itemsx_itemsmax_handlesmax_hashtagsmax_subredditsreturnc                 b    t        |      }t        |      }t        |       }|d| |d| |d| dS )a  Extract key entities from Phase 1 results for supplemental searches.

    Parses X results for @handles and #hashtags, Reddit results for subreddit
    names and cross-referenced communities.

    Args:
        reddit_items: Raw Reddit item dicts from Phase 1
        x_items: Raw X item dicts from Phase 1
        max_handles: Maximum handles to return
        max_hashtags: Maximum hashtags to return
        max_subreddits: Maximum subreddits to return

    Returns:
        Dict with keys: x_handles, x_hashtags, reddit_subreddits
    N)	x_handles
x_hashtagsreddit_subreddits)_extract_x_handles_extract_x_hashtags_extract_subreddits)r   r   r   r   r   handleshashtags
subredditss           Y/home/ubuntu/.openclaw/workspace/skills/last30days-official/scripts/lib/entity_extract.pyextract_entitiesr,      sL    , !)G"7+H$\2J \k*}-'8     c                    t               }| D ]  }|j                  dd      j                         j                  d      j	                         }|r|t
        vr||xx   dz  cc<   |j                  dd      }t        j                  d|      }|D ](  }|j	                         }|t
        vs||xx   dz  cc<   *  |j                         D 	cg c]  \  }}	|	 c}	}S c c}	}w )zExtract and rank @handles from X results.

    Sources handles from:
    1. author_handle field (who posted)
    2. @mentions in post text (who they're talking about/to)

    Returns handles ranked by frequency, filtered for generic accounts.
    author_handle @   textz@(\w{1,15}))	r   getstriplstriplowerGENERIC_HANDLESrefindallmost_common)
r   handle_countsitemauthorr3   mentionsmentionmention_lowerh_s
             r+   r%   r%   2   s     IM 2/2.446==cBHHJfO3&!Q&! xx#::nd3 	2G#MMOMO3m,1,	22 (3356$!QA666s   Cc                    t               }| D ]L  }|j                  dd      }t        j                  d|      }|D ]  }||j	                         xx   dz  cc<    N |j                         D cg c]
  \  }}d|  c}}S c c}}w )zZExtract and rank #hashtags from X results.

    Returns hashtags ranked by frequency.
    r3   r0   z#(\w{2,30})r2   #)r   r4   r9   r:   r7   r;   )r   hashtag_countsr=   r3   tagstagtrC   s           r+   r&   r&   O   s    
 YN -xx#zz.$/ 	-C399;'1,'	-- !/ : : <=1asG===s   /Bc                    t               }| D ]  }|j                  dd      j                         j                  d      }|r||xx   dz  cc<   |j                  dg       D ],  }t	        j
                  d|      }|D ]  }||xx   dz  cc<    . |j                  dg       D ]>  }|j                  dd      }t	        j
                  d|      }|D ]  }||xx   dz  cc<    @  |j                         D 	cg c]  \  }}	|	 c}	}S c c}	}w )	zExtract and rank subreddits from Reddit results.

    Sources from:
    1. subreddit field on each result
    2. Cross-references in comment text (e.g., "check out r/localLLaMA")

    Returns subreddits ranked by frequency.
    	subredditr0   zr/r2   comment_insightszr/(\w{2,30})top_commentsexcerpt)r   r4   r5   r6   r9   r:   r;   )
r   
sub_countsr=   subinsight
cross_refsrefcommentrN   rC   s
             r+   r'   r'   `   s    J %hh{B'--/66t<sOq O xx 2B7 	%GOW=J! %31$%	% xx3 	%Gkk)R0GOW=J! %31$%	%%( )4467FCC777s   7D)      rU   )__doc__r9   collectionsr   typingr   r   r   r8   strintr,   r%   r&   r'    r-   r+   <module>r]      s    N 	  " " tCH~&$sCx.!  	
  
#tCy.B7T#s(^ 4 7c 7:>d38n!5 >$s) >"8d4S>&: 8tCy 8r-   