
    ?i!              
          d Z ddlZddlmZmZmZmZ ddlmZ  e	h d      Z
dedefd	Zd&ded
edee   fdZdee   dee   defdZeej"                  ej$                  ej&                  ej(                  ej*                  ej,                  ej.                  ej0                  f   ZdedefdZdedefdZdedee   fdZdededefdZdededefdZ	 d'deeej"                  ej$                  f      dedeeeef      fdZ	 d'deeej"                  ej$                  f      dedeeej"                  ej$                  f      fdZ 	 d'deej"                     dedeej"                     fdZ!	 d'deej$                     dedeej$                     fdZ"	 d'deej&                     dedeej&                     fdZ#	 d'deej(                     dedeej(                     fdZ$	 d'deej*                     dedeej*                     fdZ%	 d'deej,                     dedeej,                     fd Z&	 d'deej.                     dedeej.                     fd!Z'd"d#d$ee   deddfd%Z(y)(z.Near-duplicate detection for last30days skill.    N)ListSetTupleUnion   )schema>.   aianatbebydohnifinisitmemynoofonorsotoweallandarebutcanforgethashowitsnotthewasyoufromhavejustshowthatthiswhatwillwithyourabouttextreturnc                     | j                         } t        j                  dd|       } t        j                  dd|       } | j                         S )zgNormalize text for comparison.

    - Lowercase
    - Remove punctuation
    - Collapse whitespace
    [^\w\s] z\s+)lowerresubstrip)r7   s    Q/home/ubuntu/.openclaw/workspace/skills/last30days-official/scripts/lib/dedupe.pynormalize_textrA      s?     ::<D66*c4(D66&#t$D::<    nc                     t        |       } t        |       |k  r| hS t        t        |       |z
  dz         D ch c]
  }| |||z     c}S c c}w )z Get character n-grams from text.r   )rA   lenrange)r7   rC   r
   s      r@   
get_ngramsrG      sN    $D
4y1}v!&s4y1}q'8!9:AD1Q3K:::s   Aset1set2c                 \    | r|syt        | |z        }t        | |z        }|dkD  r||z  S dS )z,Compute Jaccard similarity between two sets.        r   )rE   )rH   rI   intersectionunions       r@   jaccard_similarityrN   '   s<    ttd{#LtE#(19<%5#5rB   itemc                    t        | t        j                        r| j                  S t        | t        j                        r| j                  S t        | t        j
                        r| j                   d| j                   S t        | t        j                        r| j                   d| j                   S t        | t        j                        r| j                   d| j                   S t        | t        j                        r| j                   d| j                   S t        | t        j                        r| j                  S | j                  S )z!Get comparable text from an item.r;   )
isinstancer   
RedditItemtitleHackerNewsItemYouTubeItemchannel_name
TikTokItemr7   author_nameInstagramItemPolymarketItemquestionWebSearchItem)rO   s    r@   get_item_textr]   4   s   $))*zz	D&//	0zz	D&,,	-**Qt00122	D&++	,))Ad../00	D&..	/))Ad../00	D&//	0**Qt}}o..	D&..	/zzyyrB   c                 >   t        | t        j                        r| j                  dd S t        | t        j                        r| j                  dd S t        | t        j
                        r| j                  dd S t        | t        j                        rX| j                  }|j                  d      r|dd j                         }|S |j                  d      r|dd j                         }|S t        | t        j                        r| j                  S t        |       S )zGet text for cross-source comparison.

    Same as get_item_text() but truncates X posts to 100 chars
    to level the playing field against short Reddit/HN titles.
    Strips 'Show HN:' prefix from HN titles for fairer matching.
    Nd   zShow HN:   zAsk HN:   )rQ   r   XItemr7   rW   rY   rT   rS   
startswithr?   rZ   r]   )rO   rS   s     r@   _get_cross_source_textrd   H   s     $%yy#$))*yy#$,,-yy#$--.

J'!"IOO%E  i(!"IOO%E$--.zzrB   c                     t        j                  dd| j                               j                         }|D ch c]  }|t        vst        |      dkD  s| c}S c c}w )z8Tokenize text for cross-source token Jaccard comparison.r:   r;   r   )r=   r>   r<   split	STOPWORDSrE   )r7   wordsws      r@   _tokenize_for_xrefrj   a   sH    FF:sDJJL1779EB! 2s1vzABBBs   AAAtext_atext_bc                     t        |       }t        |      }|r|syt        ||z        }t        ||z        }|r||z  S dS )z.Token-level Jaccard similarity (word overlap).rK   )rj   rE   )rk   rl   tokens_atokens_brL   rM   s         r@   _token_jaccardrp   g   sN    !&)H!&)H8x(*+L8#$E#(<%1c1rB   c                 n    t        t        |       t        |            }t        | |      }t        ||      S )zAHybrid similarity: max of char-trigram Jaccard and token Jaccard.)rN   rG   rp   max)rk   rl   trigram_sim	token_sims       r@   _hybrid_similarityru   r   s1    $Z%7F9KLKvv.I{I&&rB   items	thresholdc                    g }| D cg c]  }t        t        |             }}t        t        |             D ]J  }t        |dz   t        |             D ]-  }t	        ||   ||         }||k\  s|j                  ||f       / L |S c c}w )zFind near-duplicate pairs in items.

    Args:
        items: List of items to check
        threshold: Similarity threshold (0-1)

    Returns:
        List of (i, j) index pairs where i < j and items are similar
    r   )rG   r]   rF   rE   rN   append)rv   rw   
duplicatesrO   ngramsr
   j
similaritys           r@   find_duplicatesr~   y   s     J ;@@$jt,-@F@3u: *q1uc%j) 	*A+F1IvayAJY&!!1a&)	**  As   Bc                 8   t        |       dk  r| S t        | |      }t               }|D ]G  \  }}| |   j                  | |   j                  k\  r|j	                  |       7|j	                  |       I t        |       D cg c]  \  }}||vs| c}}S c c}}w )zRemove near-duplicates, keeping highest-scored item.

    Args:
        items: List of items (should be pre-sorted by score descending)
        threshold: Similarity threshold

    Returns:
        Deduplicated items
    r   )rE   r~   setscoreadd	enumerate)rv   rw   	dup_pairs	to_remover
   r|   idxrO   s           r@   dedupe_itemsr      s     5zQ  y1I I 18>>U1X^^+MM!MM! #,E"2KYS$c6JDKKKs   BBc                     t        | |      S )zDedupe Reddit items.r   rv   rw   s     r@   dedupe_redditr          
 y))rB   c                     t        | |      S )zDedupe X items.r   r   s     r@   dedupe_xr      r   rB   c                     t        | |      S )zDedupe YouTube items.r   r   s     r@   dedupe_youtuber      r   rB   c                     t        | |      S )zDedupe TikTok items.r   r   s     r@   dedupe_tiktokr      r   rB   c                     t        | |      S )zDedupe Instagram items.r   r   s     r@   dedupe_instagramr      r   rB   c                     t        | |      S )zDedupe Hacker News items.r   r   s     r@   dedupe_hackernewsr      r   rB   c                     t        | |      S )zDedupe Polymarket items.r   r   s     r@   dedupe_polymarketr      r   rB   g?)rw   source_listsc                    g }|D ]  }|j                  |        t        |      dk  ry|D cg c]  }t        |       }}t        t        |            D ]  }t        |dz   t        |            D ]  }t	        ||         t	        ||         u r t        ||   ||         }|| k\  s8||   j                  ||   j                  vr+||   j                  j                  ||   j                         ||   j                  ||   j                  vs||   j                  j                  ||   j                           yc c}w )a  Annotate items with cross-source references.

    Compares items across different source types using hybrid similarity
    (max of char-trigram Jaccard and token Jaccard). When similarity exceeds
    threshold, adds bidirectional cross_refs with the related item's ID.
    Modifies items in-place.

    Args:
        *source_lists: Variable number of per-source item lists
        threshold: Similarity threshold for cross-linking (default 0.40)
    r   N)	extendrE   rd   rF   typeru   id
cross_refsry   )	rw   r   	all_itemssource_listrO   textsr
   r|   r}   s	            r@   cross_source_linkr      s@    I# &%& 9~ 7@@d#D)@E@3y>" Dq1uc)n- 	DAIaL!T)A,%77+E!HeAh?JY&Q<??)A,*A*AAaL++229Q<??CQ<??)A,*A*AAaL++229Q<??C	DD As   E)   )gffffff?))__doc__r=   typingr   r   r   r    r   	frozensetrg   strrA   intrG   floatrN   rR   rb   rU   rW   rY   rT   rZ   r\   AnyItemr]   rd   rj   rp   ru   r~   r   r   r   r   r   r   r   r   r    rB   r@   <module>r      s   4 	 * *    	
 
 
;S ;S ;S ;6SX 6SX 6% 6 !!6<<1C1CVEVEV$$f&;&;V=R=RTZThThi j C ( S 2CS CSX C23 2 2 2's 'C 'E ' f''567 
%S/: Lf''567LL 
%!!6<</
01LF *!!"** 
&

* *** 
&,,* *""#** 
&

* *!!"** 
&

* *$$%** 
&

* *%%&** 
&

 * *%%&** 
&

 * %D=%D%D 
%DrB   