From 831b7a6dc4f10cdcb17f2c09b17f75839e9643de Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Fri, 15 Nov 2024 02:03:40 -0800 Subject: [PATCH] Integrate the youtube-transcript-api library as a tool --- .../builtin/transcript/_assets/icon.svg | 11 ++ .../builtin/transcript/tools/transcript.py | 84 +++++++++++++++ .../builtin/transcript/tools/transcript.yaml | 101 ++++++++++++++++++ .../provider/builtin/transcript/transcript.py | 12 +++ .../builtin/transcript/transcript.yaml | 14 +++ api/poetry.lock | 18 +++- api/pyproject.toml | 1 + 7 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 api/core/tools/provider/builtin/transcript/_assets/icon.svg create mode 100644 api/core/tools/provider/builtin/transcript/tools/transcript.py create mode 100644 api/core/tools/provider/builtin/transcript/tools/transcript.yaml create mode 100644 api/core/tools/provider/builtin/transcript/transcript.py create mode 100644 api/core/tools/provider/builtin/transcript/transcript.yaml diff --git a/api/core/tools/provider/builtin/transcript/_assets/icon.svg b/api/core/tools/provider/builtin/transcript/_assets/icon.svg new file mode 100644 index 0000000000..83b0700fec --- /dev/null +++ b/api/core/tools/provider/builtin/transcript/_assets/icon.svg @@ -0,0 +1,11 @@ + + + + + + + + + \ No newline at end of file diff --git a/api/core/tools/provider/builtin/transcript/tools/transcript.py b/api/core/tools/provider/builtin/transcript/tools/transcript.py new file mode 100644 index 0000000000..d4e8ea8eef --- /dev/null +++ b/api/core/tools/provider/builtin/transcript/tools/transcript.py @@ -0,0 +1,84 @@ +from typing import Any, Dict, Union, List +from urllib.parse import urlparse, parse_qs +from youtube_transcript_api import YouTubeTranscriptApi +from core.tools.tool.builtin_tool import BuiltinTool +from core.tools.entities.tool_entities import ToolInvokeMessage + +class YouTubeTranscriptTool(BuiltinTool): + def _invoke(self, user_id: str, tool_parameters: Dict[str, Any]) -> Union[ToolInvokeMessage, List[ToolInvokeMessage]]: + """ + Invoke the YouTube transcript tool + """ + try: + # Extract parameters with defaults + video_input = tool_parameters['video_id'] + language = tool_parameters.get('language') + output_format = tool_parameters.get('format', 'text') + preserve_formatting = tool_parameters.get('preserve_formatting', False) + proxy = tool_parameters.get('proxy') + cookies = tool_parameters.get('cookies') + + # Extract video ID from URL if needed + video_id = self._extract_video_id(video_input) + + # Common kwargs for API calls + kwargs = { + 'proxies': {"https": proxy} if proxy else None, + 'cookies': cookies + } + + try: + if language: + transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, **kwargs) + try: + transcript = transcript_list.find_transcript([language]) + except: + # If requested language not found, try translating from English + transcript = transcript_list.find_transcript(['en']).translate(language) + transcript_data = transcript.fetch() + else: + transcript_data = YouTubeTranscriptApi.get_transcript( + video_id, + preserve_formatting=preserve_formatting, + **kwargs + ) + + # Format output + formatter_class = { + 'json': 'JSONFormatter', + 'pretty': 'PrettyPrintFormatter', + 'srt': 'SRTFormatter', + 'vtt': 'WebVTTFormatter' + }.get(output_format) + + if formatter_class: + from youtube_transcript_api import formatters + formatter = getattr(formatters, formatter_class)() + formatted_transcript = formatter.format_transcript(transcript_data) + else: + formatted_transcript = ' '.join(entry['text'] for entry in transcript_data) + + return self.create_text_message(text=formatted_transcript) + + except Exception as e: + return self.create_text_message( + text=f"Error getting transcript: {str(e)}" + ) + + except Exception as e: + return self.create_text_message( + text=f"Error processing request: {str(e)}" + ) + + def _extract_video_id(self, video_input: str) -> str: + """ + Extract video ID from URL or return as-is if already an ID + """ + if 'youtube.com' in video_input or 'youtu.be' in video_input: + # Parse URL + parsed_url = urlparse(video_input) + if 'youtube.com' in parsed_url.netloc: + return parse_qs(parsed_url.query)['v'][0] + else: # youtu.be + return parsed_url.path[1:] + return video_input # Assume it's already a video ID \ No newline at end of file diff --git a/api/core/tools/provider/builtin/transcript/tools/transcript.yaml b/api/core/tools/provider/builtin/transcript/tools/transcript.yaml new file mode 100644 index 0000000000..123b2f7673 --- /dev/null +++ b/api/core/tools/provider/builtin/transcript/tools/transcript.yaml @@ -0,0 +1,101 @@ +identity: + name: free_youtube_transcript + author: Tao Wang + label: + en_US: Free YouTube Transcript API + zh_Hans: 免费获取 YouTube 转录 +description: + human: + en_US: Get transcript from a YouTube video for free. + zh_Hans: 免费获取 YouTube 视频的转录文案。 + llm: A tool for retrieving transcript from YouTube videos. +parameters: + - name: video_id + type: string + required: true + label: + en_US: Video ID/URL + zh_Hans: 视频ID + human_description: + en_US: Used to define the video from which the transcript will be fetched. You can find the id in the video url. For example - https://www.youtube.com/watch?v=video_id. + zh_Hans: 您要哪条视频的转录文案?您可以在视频链接中找到id。例如 - https://www.youtube.com/watch?v=video_id。 + llm_description: Used to define the video from which the transcript will be fetched. For example - https://www.youtube.com/watch?v=video_id. + form: llm + - name: language + type: string + required: false + label: + en_US: Language Code + zh_Hans: 语言 + human_description: + en_US: Language code (e.g. 'en', 'zh') for the transcript. + zh_Hans: 字幕语言代码(如'en'、'zh')。留空则自动选择。 + llm_description: Used to set the language for transcripts. + form: form + - name: format + type: select + required: false + default: text + options: + - value: text + label: + en_US: Plain Text + zh_Hans: 纯文本 + - value: json + label: + en_US: JSON Format + zh_Hans: JSON 格式 + - value: pretty + label: + en_US: Pretty Print Format + zh_Hans: 美化格式 + - value: srt + label: + en_US: SRT Format + zh_Hans: SRT 格式 + - value: vtt + label: + en_US: WebVTT Format + zh_Hans: WebVTT 格式 + label: + en_US: Output Format + zh_Hans: 输出格式 + human_description: + en_US: Format of the transcript output + zh_Hans: 字幕输出格式 + llm_description: The format to output the transcript in. Options are text (plain text), json (raw transcript data), srt (SubRip format), or vtt (WebVTT format) + form: form + - name: preserve_formatting + type: boolean + required: false + default: false + label: + en_US: Preserve Formatting + zh_Hans: 保留格式 + human_description: + en_US: Keep HTML formatting elements like (italics) and (bold) + zh_Hans: 保留HTML格式元素,如(斜体)和(粗体) + llm_description: Whether to preserve HTML formatting elements in the transcript text + form: form + - name: proxy + type: string + required: false + label: + en_US: HTTPS Proxy + zh_Hans: HTTPS 代理 + human_description: + en_US: HTTPS proxy URL (e.g. https://user:pass@domain:port) + zh_Hans: HTTPS 代理地址(如 https://user:pass@domain:port) + llm_description: HTTPS proxy to use for the request. Format should be https://user:pass@domain:port + form: form + - name: cookies + type: string + required: false + label: + en_US: Cookies File Path + zh_Hans: Cookies 文件路径 + human_description: + en_US: Path to cookies.txt file for accessing age-restricted videos + zh_Hans: 用于访问年龄限制视频的 cookies.txt 文件路径 + llm_description: Path to a cookies.txt file containing YouTube cookies, needed for accessing age-restricted videos + form: form \ No newline at end of file diff --git a/api/core/tools/provider/builtin/transcript/transcript.py b/api/core/tools/provider/builtin/transcript/transcript.py new file mode 100644 index 0000000000..b119bc5163 --- /dev/null +++ b/api/core/tools/provider/builtin/transcript/transcript.py @@ -0,0 +1,12 @@ +from typing import Any, Dict +from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController +from core.tools.errors import ToolProviderCredentialValidationError + +from core.tools.provider.builtin.transcript.tools.transcript import YouTubeTranscriptTool + +class YouTubeTranscriptProvider(BuiltinToolProviderController): + def _validate_credentials(self, credentials: Dict[str, Any]) -> None: + """ + No credentials needed for YouTube Transcript API + """ + pass \ No newline at end of file diff --git a/api/core/tools/provider/builtin/transcript/transcript.yaml b/api/core/tools/provider/builtin/transcript/transcript.yaml new file mode 100644 index 0000000000..26924aacc3 --- /dev/null +++ b/api/core/tools/provider/builtin/transcript/transcript.yaml @@ -0,0 +1,14 @@ +identity: + author: Tao Wang + name: transcript + label: + en_US: Transcript + zh_Hans: Transcript + description: + en_US: Get transcripts from YouTube videos + zh_Hans: 获取 YouTube 视频的字幕/转录文本 + icon: icon.svg + tags: + - videos +credentials_for_provider: + # No credentials needed for this provider as the YouTube Transcript API is free \ No newline at end of file diff --git a/api/poetry.lock b/api/poetry.lock index 74c2ef5dc6..f52ba234f2 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -10854,6 +10854,20 @@ requests = ">=2.31" nospam = ["requests-cache (>=1.0)", "requests-ratelimiter (>=0.3.1)"] repair = ["scipy (>=1.6.3)"] +[[package]] +name = "youtube-transcript-api" +version = "0.6.2" +description = "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do!" +optional = false +python-versions = "*" +files = [ + {file = "youtube_transcript_api-0.6.2-py3-none-any.whl", hash = "sha256:019dbf265c6a68a0591c513fff25ed5a116ce6525832aefdfb34d4df5567121c"}, + {file = "youtube_transcript_api-0.6.2.tar.gz", hash = "sha256:cad223d7620633cec44f657646bffc8bbc5598bd8e70b1ad2fa8277dec305eb7"}, +] + +[package.dependencies] +requests = "*" + [[package]] name = "zhipuai" version = "2.1.5.20230904" @@ -11078,4 +11092,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "2ba4b464eebc26598f290fa94713acc44c588f902176e6efa80622911d40f0ac" +content-hash = "69a3f471f85dce9e5fb889f739e148a4a6d95aaf94081414503867c7157dba69" diff --git a/api/pyproject.toml b/api/pyproject.toml index 8def7cd8f7..0d87c1b1c8 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -187,6 +187,7 @@ websocket-client = "~1.7.0" werkzeug = "~3.0.1" xinference-client = "0.15.2" yarl = "~1.9.4" +youtube-transcript-api = "~0.6.2" zhipuai = "~2.1.5" # Before adding new dependency, consider place it in alphabet order (a-z) and suitable group.