
    ?i-                        d Z ddlZddlmZmZ ddlmZmZmZmZm	Z	 ddl
mZ ddlmZ i ddd	dd
dddddddddddddddddddddddddddd ddd!d!d"d"d#d#d$Zd%ed&ee   fd'Zd(ed&ee   fd)Zd%ed*ed+ed&e	ee   ef   fd,Zh d-Zd%ed&efd.Zd%ed&efd/Z	 	 d8d0eeeef      d1ed2ed3ed&eeeef      f
d4Zd5eeeef      d2ed3ed&eej0                     fd6Zd5eej0                     d&eej0                     fd7Zy)9a  WebSearch module for last30days skill.

NOTE: WebSearch uses the assistant's built-in web search tool, which runs inside the host environment.
Unlike Reddit/X which use external APIs, web search results are obtained by the assistant
directly and passed to this module for normalization and scoring.

The typical flow is:
1. The assistant invokes its web search tool with the topic
2. The assistant passes results to parse_websearch_results()
3. Results are normalized into WebSearchItem objects
    N)datetime	timedelta)AnyDictListOptionalTuple)urlparse   )schemajanjanuaryfeb   februarymar   marchapr   aprilmay   jun   junejul   julyaug   augustsep	   sept
         )	septemberoctoctobernovnovemberdecdecemberurlreturnc                    t        j                  d|       }|rc|j                         \  }}}dt        |      cxk  rdk  r;n n8dt        |      cxk  rdk  r$n n!dt        |      cxk  rdk  rn n
| d| d| S t        j                  d|       }|rc|j                         \  }}}dt        |      cxk  rdk  r;n n8dt        |      cxk  rdk  r$n n!dt        |      cxk  rdk  rn n
| d| d| S t        j                  d	|       }|rc|j                         \  }}}dt        |      cxk  rdk  r;n y
dt        |      cxk  rdk  r$n y
dt        |      cxk  rdk  rn y
| d| d| S y
)a  Try to extract a date from URL path.

    Many sites embed dates in URLs like:
    - /2026/01/24/article-title
    - /2026-01-24/article
    - /blog/20260124/title

    Args:
        url: URL to parse

    Returns:
        Date string in YYYY-MM-DD format, or None
    z/(\d{4})/(\d{2})/(\d{2})/    r   r(      -z/(\d{4})-(\d{2})-(\d{2})[-/]z/(\d{4})(\d{2})(\d{2})/N)researchgroupsint)r0   matchyearmonthdays        T/home/ubuntu/.openclaw/workspace/skills/last30days-official/scripts/lib/websearch.pyextract_date_from_urlr@   &   sq    II2C8E <<>eS3t9$$c%j)>B)>1CCVTVCVV1UG1SE** II5s;E <<>eS3t9$$c%j)>B)>1CCVTVCVV1UG1SE** II0#6E <<>eS3t9$$  *+c%j)>B)>  DECCVTVCV  V1UG1SE**    textc                 n   | sy| j                         }t        j                  d|      }|rq|j                         \  }}}t        j                  |dd       }|rCdt        |      cxk  rdk  r/n n,dt        |      cxk  rdk  rn n| d|d	dt        |      d	S t        j                  d
|      }|rq|j                         \  }}}t        j                  |dd       }|rCdt        |      cxk  rdk  r/n n,dt        |      cxk  rdk  rn n| d|d	dt        |      d	S t        j                  d|       }|rc|j                         \  }}}dt        |      cxk  rdk  r;n n8dt        |      cxk  rdk  r$n n!dt        |      cxk  rdk  rn n
| d| d| S t        j                         }d|v r |t        d      z
  }|j                  d      S d|v r|j                  d      S t        j                  d|      }|r?t        |j                  d            }	|	dk  r |t        |	      z
  }|j                  d      S t        j                  d|      }|r|j                  d      S d|v r |t        d      z
  }|j                  d      S d|v r |t        d      z
  }|j                  d      S y)a6  Try to extract a date from text snippet or title.

    Looks for patterns like:
    - January 24, 2026 or Jan 24, 2026
    - 24 January 2026
    - 2026-01-24
    - "3 days ago", "yesterday", "last week"

    Args:
        text: Text to parse

    Returns:
        Date string in YYYY-MM-DD format, or None
    Nz\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s*(\d{4})\br   r3   r4   r   r5   r6   02dz\b(\d{1,2})(?:st|nd|rd|th)?\s+(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+(\d{4})\bz\b(\d{4})-(\d{2})-(\d{2})\br(   	yesterday)daysz%Y-%m-%dtodayz\b(\d+)\s*days?\s*ago\b<   z\b(\d+)\s*hours?\s*ago\bz	last weekr   z	this week)lowerr7   r8   r9   	MONTH_MAPgetr:   r   nowr   strftimegroup)
rB   
text_lowerr;   	month_strr>   r<   r=   rG   daterF   s
             r?   extract_date_from_snippetrR   L   s    J II	6 		E $||~	3im,TSY.$.1C3FB3FV1U3KqS#77 II	 	E $||~Yim,TSY.$.1C3FB3FV1U3KqS#77 II4d;E <<>eS3t9$$c%j)>B)>1CCVTVCVV1UG1SE** LLNEj ya((}}Z((*~~j)) II0*=E5;;q>"2:9$//D==,, II1:>E~~j)) j ya((}}Z(( j ya((}}Z((rA   snippettitlec                 j    t        |       }|r|dfS t        |      }|r|dfS t        |      }|r|dfS y)a  Extract date from any available signal.

    Tries URL first (most reliable), then snippet, then title.

    Args:
        url: Page URL
        snippet: Page snippet/description
        title: Page title

    Returns:
        Tuple of (date_string, confidence)
        - date from URL: 'high' confidence
        - date from snippet/title: 'med' confidence
        - no date found: None, 'low' confidence
    highmed)Nlow)r@   rR   )r0   rS   rT   url_datesnippet_date
title_dates         r?   extract_date_signalsr\      sS    * %S)H -W5LU"" +51J5  rA   >   x.com	www.x.com
reddit.comtwitter.comold.reddit.comwww.reddit.comwww.twitter.commobile.twitter.comc                     	 t        |       }|j                  j                         }|j                  d      r|dd }|S # t        $ r Y yw xY w)z}Extract the domain from a URL.

    Args:
        url: Full URL

    Returns:
        Domain string (e.g., "medium.com")
    zwww.r   N )r
   netlocrI   
startswith	Exceptionr0   parseddomains      r?   extract_domainrm      sR    #$$&V$ABZF s   <? 	A
Ac                 |    	 t        |       }|j                  j                         }|t        v S # t        $ r Y yw xY w)zCheck if URL is from an excluded domain (Reddit/X).

    Args:
        url: URL to check

    Returns:
        True if URL should be excluded
    F)r
   rg   rI   EXCLUDED_DOMAINSri   rj   s      r?   is_excluded_domainrp      s@    #$$&))) s   ,/ 	;;resultstopic	from_dateto_datec                 n   g }t        |       D ]  \  }}t        |t              s|j                  dd      }|s-t	        |      r9t        |j                  dd            j                         }t        |j                  d|j                  dd                  j                         }	|s|	s|j                  d      }
d}|
r"t        j                  dt        |
            rd	}nt        ||	|      \  }}|r|}
|}|
r|r|
|k  r|
r	|r|
|kD  r|j                  d
d      }	 t        dt        dt        |                  }d|dz    |dd |t        |      |	dd |
||t        |j                  dd            j                         d	}|j!                  |        |S # t        t        f$ r d}Y qw xY w)a  Parse WebSearch results into normalized format.

    This function expects results from Claude's WebSearch tool.
    Each result should have: title, url, snippet, and optionally date/relevance.

    Uses "Date Detective" approach:
    1. Extract dates from URLs (high confidence)
    2. Extract dates from snippets/titles (med confidence)
    3. Hard filter: exclude items with verified old dates
    4. Keep items with no date signals (with low confidence penalty)

    Args:
        results: List of WebSearch result dicts
        topic: Original search topic (for context)
        from_date: Start date for filtering (YYYY-MM-DD)
        to_date: End date for filtering (YYYY-MM-DD)

    Returns:
        List of normalized item dicts ready for WebSearchItem creation
    r0   rf   rT   rS   descriptionrQ   rX   z^\d{4}-\d{2}-\d{2}$rW   	relevance      ?g      ?g        Wr   N   i  why_relevant	idrT   r0   source_domainrS   rQ   date_confidencerw   r{   )	enumerate
isinstancedictrK   rp   strstripr7   r;   r\   minmaxfloat	TypeError
ValueErrorrm   append)rq   rr   rs   rt   itemsiresultr0   rT   rS   rQ   r   extracted_date
confidencerw   items                   r?   parse_websearch_resultsr      s   4 Ew' ;	6&$'jj# c"FJJw+,224fjjFJJ}b,IJKQQSW zz&!BHH3SY?#O *>c7E)R&NJ%", I$"2 Gw JJ{C0		CS%	*:!;<I
 acU)4C[+C0t}."

>2 >?EEG

 	Tw;z L# :& 	I	s   #F  F43F4r   c                    g }| D ]  }t        j                  |d   |d   |d   |d   |d   |j                  d      |j                  dd      |j                  d	d
      |j                  dd      	      }|j                  |        |S )zConvert parsed dicts to WebSearchItem objects.

    Args:
        items: List of parsed item dicts
        from_date: Start of date range (YYYY-MM-DD)
        to_date: End of date range (YYYY-MM-DD)

    Returns:
        List of WebSearchItem objects
    r}   rT   r0   r~   rS   rQ   r   rX   rw   rx   r{   rf   r|   )r   WebSearchItemrK   r   )r   rs   rt   r   r   web_items         r?   normalize_websearch_itemsr   Z  s     F  ''Dzw-U/O&! HH%6>hh{C0."5

 	h  MrA   c                     t               }g }| D ]R  }|j                  j                         j                  d      }||vs1|j	                  |       |j                  |       T |S )zRemove duplicate WebSearch items.

    Deduplication is based on URL.

    Args:
        items: List of WebSearchItem objects

    Returns:
        Deduplicated list
    /)setr0   rI   rstripaddr   )r   	seen_urlsr   r   url_keys        r?   dedupe_websearchr   |  sa     IF  ((.."))#.)#MM'"MM$  MrA   )rf   rf   )__doc__r7   r   r   typingr   r   r   r   r	   urllib.parser
   rf   r   rJ   r   r@   rR   r\   ro   rm   boolrp   r   r   r   r    rA   r?   <module>r      sb  
 
 ( 3 3 ! 	1	1! 
1 q 
1	 q	
 
1 
1 a 
1 a 
1  
1 a '("22	 #s #x} #LWC WHSM Wt#	## # 8C=#	#N	   (C D ( 	Y$sCx.!YY Y 	Y
 
$sCx.YxS#X  
&

	DD!5!56 4@T@T;U rA   