
    ?i5                     t   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZmZ ddlmZ ddlmZmZmZmZmZmZ dddd	Zd
ddd	ZdZ eh d      Zi dddhdddhdh ddh dddhddhddhddhdddhdddhdd hd dhd!d"hd"d!hd#d$hd$d#hZd%ed&ee   fd'Zd(ed)ed&efd*Zd+efd,Z d&e!fd-Z"d.ed&efd/Z#	 d?d.ed0ed1ed2ed&eeef   f
d3Z$d4ed&efd5Z%d6ed7ed&ee   fd8Z&	 d@d9ee   d:e'd&eeee   f   fd;Z(	 d?d.ed0ed1ed2ed&eeef   f
d<Z)d=eeef   d&eeeef      fd>Z*y)Au.  YouTube search and transcript extraction via yt-dlp for /last30days v2.1.

Uses yt-dlp (https://github.com/yt-dlp/yt-dlp) for both YouTube search and
transcript extraction. No API keys needed — just have yt-dlp installed.

Inspired by Peter Steinberger's toolchain approach (yt-dlp + summarize CLI).
    N)ThreadPoolExecutoras_completed)Path)AnyDictListOptionalSetTuple
      (   )quickdefaultdeep         i  >,   aianatbebydoifinisitmemynoofonorsotoweallandarebutcanforgethashowitsnotthewasyoufromhavejustthatthiswhatwillwithyourabouthipraphiphophop>   rA   rD   rC   >   rA   rD   rB   js
javascriptts
typescriptai
artificialintelligencemlmachinelearningreactreactjssveltesveltejsvuevuejstextreturnc                 *   t        j                  dd| j                               j                         }|D ch c]  }|t        vst        |      dkD  s| }}t        |      }|D ]#  }|t        v s|j                  t        |          % |S c c}w )zLowercase, strip punctuation, remove stopwords, drop single-char tokens.
    Expands tokens with synonyms for better cross-domain matching.z[^\w\s]    )	resublowersplit	STOPWORDSlensetSYNONYMSupdate)rU   wordswtokensexpandedts         U/home/ubuntu/.openclaw/workspace/skills/last30days-official/scripts/lib/youtube_yt.py	_tokenizeri   E   s     FF:sDJJL1779EDA!9"4Q!aDFD6{H )=OOHQK() O Es   BBBquerytitlec                     t        |       }t        |      }|syt        ||z        }|t        |      z  }t        dt        d|            S )zCompute relevance as ratio of query tokens found in title.

    Uses ratio overlap (intersection / query_length) so short queries
    score higher when fully represented in the title. Floors at 0.1.
    g      ?g?g      ?)ri   r_   maxmin)rj   rk   q_tokenst_tokensoverlapratios         rh   _compute_relevancers   R   sN     HH(X%&Gc(m#EsCUO$$    msgc                     t         j                  j                  d|  d       t         j                  j                          y)zLog to stderr.z
[YouTube] 
N)sysstderrwriteflush)ru   s    rh   _logr|   c   s-    JJz#b)*JJrt   c                  0    t        j                  d      duS )z%Check if yt-dlp is available in PATH.yt-dlpN)shutilwhich rt   rh   is_ytdlp_installedr   i   s    <<!--rt   topicc                 \   | j                         j                         }g d}|D ]3  }|j                  |dz         s|t        |      d j                         }5 h d}|j	                         }|D cg c]	  }||vs| }}|rdj                  |      n|}|j                  d      S c c}w )zExtract core subject from verbose query for YouTube search.

    Strips meta/research words to keep only the core product/concept name,
    similar to bird_x.py's approach.
    )zwhat are the bestzwhat is the bestzwhat are the latestzwhat are people saying aboutzwhat do people think aboutzhow do i usez
how to usezhow tozwhat arezwhat isztips forzbest practices forrX   N>   newtopbestgoodnewsgreatviraladvicekillerlatestpromptrb   awesomehottestmethodspopularpromptsupdatesfeaturestrending	practices	prompting
approaches
strategiesrecommendationsz?!.)r\   strip
startswithr_   r]   joinrstrip)	r   rU   prefixespnoiserc   rd   filteredresults	            rh   _extract_core_subjectr   n   s     ;;= DH  )??1s7#A=&&(D)E JJLE 3aAUN3H3#+SXXhF== 4s   3	B)=B)	from_dateto_datedepthc                 j   t               sg ddS t        j                  |t        d         }t        |       }t	        d| d| d| d       dd	| d
| dddg}t        t        d      rt        j                  nd}	 t        j                  |t        j                  t        j                  d|      }	 |j                  d      \  }	}
	 |	xs dj1                         st	        d       dg iS g }|	j1                         j3                  d      D ]E  }|j1                         }|s	 t5        j6                  |      }|j                  dd      }|j                  d      xs d}|j                  d      xs d}|j                  d       xs d}|j                  d!d      }d}|r!t;        |      d"k(  r|dd#  d$|d#d%  d$|d%d"  }|j=                  ||j                  d&d      d'| |j                  d(|j                  d)d            ||||d*|j                  d+      t?        ||j                  d&d            d,|j                  d&|      dd-  d.	       H |D cg c]  }|d/   s	|d/   |k\  s| }}t;        |      d0k\  r|}t	        d1t;        |       d2       n$t	        d1t;        |       d3t;        |       d4       |jA                  d5 d6       d|iS # t        j                  $ r 	 t        j                  t        j                  |j                        t         j"                         n*# t$        t&        t(        f$ r |j+                          Y nw xY w|j-                  d       t	        d       g ddcY S w xY w# t.        $ r g ddcY S w xY w# t4        j8                  $ r Y w xY wc c}w )7a  Search YouTube via yt-dlp. No API key needed.

    Args:
        topic: Search topic
        from_date: Start date (YYYY-MM-DD)
        to_date: End date (YYYY-MM-DD)
        depth: 'quick', 'default', or 'deep'

    Returns:
        Dict with 'items' list of video metadata dicts.
    zyt-dlp not installed)itemserrorr   zSearching YouTube for 'z	' (since z, count=)r~   ytsearch:z--dump-json--no-warningsz--no-downloadsetsidNTstdoutry   rU   
preexec_fnx   timeoutr   zYouTube search timed out (120s)zSearch timed outzyt-dlp not found z!YouTube search returned 0 resultsr   rw   id
view_countr   
like_countcomment_countupload_dater      -   rk    https://www.youtube.com/watch?v=channeluploader)viewslikescommentsdurationz	YouTube: <   )	video_idrk   urlchannel_namedate
engagementr   	relevancewhy_relevantr   r   zFound z videos within date rangez	 videos (z  within date range, keeping all)c                     | d   d   S )Nr   r   r   )xs    rh   <lambda>z search_youtube.<locals>.<lambda>  s    Q|_W5 rt   )keyreverse)!r   DEPTH_CONFIGr/   r   r|   hasattrosr   
subprocessPopenPIPEcommunicateTimeoutExpiredkillpggetpgidpidsignalSIGTERMProcessLookupErrorPermissionErrorOSErrorkillwaitFileNotFoundErrorr   r]   jsonloadsJSONDecodeErrorr_   appendrs   sort)r   r   r   r   count
core_topiccmdpreexecprocr   ry   r   linevideor   r   r   r   r   date_strr   recents                         rh   search_youtuber      s   " &<==UL$;<E&u-J":,i	{(5'QR	ST 	
5':,'C #2x0biidG:????
		>!--c-:NFF Lb!01} E$$T* "zz|	JJt$E 99T2&YY|,1
YY|,1
		/27aiir2 3{+q0%bq/*!K!,<+=Q{1Q?O>PQH YYw+5hZ@!IIi:r1JK##)
 		*-+J		'28NO'		':(Fs(K'LM
 	)"J GA!F)&	Y0FaGFG
6{avc%j\!:;<vc%j\3v;-7WXY 
JJ5tJDUA (( 	>		"**TXX.?&A 		IIaI 23*<==	>  :&899: ## 		> Hsy   =6N 4K &N
N0&N0/N0M?.AL0/M?0$MM?M%M?<N >M??N NNN-,N-vtt_textc                 :   t        j                  dd| t         j                        }t        j                  dd|      }t        j                  dd|      }t        j                  dd|t         j                        }|j	                         j                  d      }t               }g }|D ]<  }|j	                         }|s||vs|j                  |       |j                  |       > t        j                  dd	d	j                  |            j	                         S )
z/Convert VTT subtitle format to clean plaintext.z^WEBVTT.*?\n\nr   )flagsz=\d{2}:\d{2}:\d{2}\.\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}\.\d{3}.*\nz<[^>]+>z^\d+\s*$rw   z\s+rX   )
rZ   r[   DOTALL	MULTILINEr   r]   r`   addr   r   )r   rU   linesseenuniquer   strippeds          rh   
_clean_vttr    s     66#RCD66RTVX\]D66*b$'D66+r4r||<DJJLt$E5DF $::<,HHXMM(#	$
 66&#sxx/06688rt   r   temp_dirc                    ddddddddd	| d
d|  g}t        t        d      rt        j                  nd}	 t        j                  |t        j
                  t        j
                  d|      }	 |j                  d       	 t'        |      |  dz  }|j)                         s&t'        |      j+                  |  d      D ]  }|} n y	 |j-                  dd      }t/        |      }|j1                         }	t3        |	      t4        kD  rdj7                  |	dt4               dz   }|r|S dS # t        j                  $ r 	 t        j                  t        j                  |j                        t        j                         n*# t        t        t        f$ r |j!                          Y nw xY w|j#                  d       Y yw xY w# t$        $ r Y yw xY w# t        $ r Y yw xY w)zFetch auto-generated transcript for a YouTube video.

    Args:
        video_id: YouTube video ID
        temp_dir: Temporary directory for subtitle files

    Returns:
        Plaintext transcript string, or None if no captions available.
    r~   z--write-auto-subsz
--sub-langenz--sub-formatvttz--skip-downloadr   z-oz/%(id)sr   r   NTr      r   r   z.en.vttz*.vttzutf-8replace)encodingerrorsrX   z...)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   existsglob	read_textr  r]   r_   TRANSCRIPT_MAX_WORDSr   )
r   r  r   r   r   vtt_pathr   raw
transcriptrc   s
             rh   fetch_transcriptr    s    	d
'"
*8*5	C #2x0biidG????
	R( H~8*G 44H??h$$z%78 	AH	   ') D CJ E
5z((XXe$9%9:;eC
#:--A (( 			"**TXX.?&A 		IIaI 	    sf   6G .D* 	G *F?>AF ?F? $F'$F?&F''F?<G >F??G 	GG	GG	video_idsmax_workersc           
         | si S t        dt        |        d       i }t        j                         5 }t	        |      5 }| D ci c]  }|j                  t        ||      | }}t        |      D ]  }||   }	 |j                         ||<    	 ddd       ddd       t        d |j                         D              }t        d| dt        |        d       |S c c}w # t        $ r d||<   Y w xY w# 1 sw Y   gxY w# 1 sw Y   kxY w)zFetch transcripts for multiple videos in parallel.

    Args:
        video_ids: List of YouTube video IDs
        max_workers: Max parallel fetches

    Returns:
        Dict mapping video_id to transcript text (or None).
    zFetching transcripts for z videos)r  Nc              3   &   K   | ]	  }|sd   yw)rY   Nr   ).0vs     rh   	<genexpr>z-fetch_transcripts_parallel.<locals>.<genexpr>  s     /AQa/s   zGot transcripts for /)r|   r_   tempfileTemporaryDirectoryr   submitr  r   r   	Exceptionsumvalues)	r  r  resultsr  executorvidfuturesfuturegots	            rh   fetch_transcripts_parallelr&  b  s)    	$S^$4G	<=G		$	$	& ((K8 
	(H %  0#x@#EG  'w/ (fo(#)==?GCL(
	(( /)/
/CuAc)n%5W	=>N ! (#'GCL(
	( 
	(( (sY   D C6C$C6:C"C6DC6"C3	0C62C3	3C66C?	;DDc                 "   t        | |||      }|j                  dg       }|s|S t        j                  |t        d         }|d| D cg c]  }|d   	 }}t        |      }	|D ]!  }|d   }
|	j                  |
      }|xs d|d<   # d|iS c c}w )aN  Full YouTube search: find videos, then fetch transcripts for top results.

    Args:
        topic: Search topic
        from_date: Start date (YYYY-MM-DD)
        to_date: End date (YYYY-MM-DD)
        depth: 'quick', 'default', or 'deep'

    Returns:
        Dict with 'items' list. Each item has a 'transcript_snippet' field.
    r   r   Nr   r   transcript_snippet)r   r/   TRANSCRIPT_LIMITSr&  )r   r   r   r   search_resultr   transcript_limititemtop_idstranscriptsr"  r  s               rh   search_and_transcriber/    s    $ #5)WeDMgr*E ),,U4Ei4PQ,12C3C,DEDtJEGE,W5K  6: __S)
%/%52!"6
 U Fs   	Bresponsec                 &    | j                  dg       S )zzParse YouTube search response to normalized format.

    Returns:
        List of item dicts ready for normalization.
    r   )r/   )r0  s    rh   parse_youtube_responser2    s     <<$$rt   )r   )r   )+__doc__r   mathr   rZ   r   r   r   rx   r  concurrent.futuresr   r   pathlibr   typingr   r   r   r	   r
   r   r   r)  r  	frozensetr^   ra   strri   floatrs   r|   boolr   r   r   r  r  intr&  r/  r2  r   rt   rh   <module>r=     s     	 	    
  ?  8 8        		E8	E8 
# #	
 	<. 4& 	<. 4& 	<
( 	9j
! i[ y zl 
 
G9  eW!(
C 
CH 
%c %# %% %"c .D .
#  #  # T 	qqq q 	q
 
#s(^qh9 9 9,A.s A.c A.hsm A.L "Cy"" 
#x}
"R 	### # 	#
 
#s(^#L%T#s(^ %T#s(^8L %rt   