add doc strings for readability and constrain types

This commit is contained in:
Jack Adamson
2024-11-29 14:54:06 +00:00
parent c820086b35
commit ea42a21078


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Tuple
 from urllib.parse import urlparse, urlunparse
 import markdownify
@@ -17,13 +17,21 @@ from mcp.types import (
     INTERNAL_ERROR,
 )
 from protego import Protego
-from pydantic import BaseModel, Field, ValidationError
+from pydantic import BaseModel, Field, AnyUrl, conint

 DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
 DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"

 def extract_content_from_html(html: str) -> str:
+    """Extract and convert HTML content to Markdown format.
+
+    Args:
+        html: Raw HTML content to process
+
+    Returns:
+        Simplified markdown version of the content
+    """
     ret = readabilipy.simple_json.simple_json_from_html_string(
         html, use_readability=True
     )
@@ -36,9 +44,17 @@ def extract_content_from_html(html: str) -> str:
     return content

-def get_robots_txt_url(url: str) -> str:
+def get_robots_txt_url(url: AnyUrl | str) -> str:
+    """Get the robots.txt URL for a given website URL.
+
+    Args:
+        url: Website URL to get robots.txt for
+
+    Returns:
+        URL of the robots.txt file
+    """
     # Parse the URL into components
-    parsed = urlparse(url)
+    parsed = urlparse(str(url))

     # Reconstruct the base URL with just scheme, netloc, and /robots.txt path
     robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", ""))
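As a quick aside (not part of the commit, and assuming pydantic v2, where AnyUrl can be constructed directly from a string), the widened url: AnyUrl | str parameter plus the str() coercion means both input types resolve to the same robots.txt location:

    from pydantic import AnyUrl

    # Hypothetical usage; both calls return "https://example.com/robots.txt".
    get_robots_txt_url("https://example.com/docs/page.html")
    get_robots_txt_url(AnyUrl("https://example.com/docs/page.html"))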
@@ -46,7 +62,7 @@ def get_robots_txt_url(url: str) -> str:
     return robots_url

-async def check_may_autonomously_fetch_url(url: str, user_agent: str):
+async def check_may_autonomously_fetch_url(url: AnyUrl | str, user_agent: str) -> None:
     """
     Check if the URL can be fetched by the user agent according to the robots.txt file.
     Raises a McpError if not.
@@ -89,7 +105,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
         )

-async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, str):
+async def fetch_url(
+    url: AnyUrl | str, user_agent: str, force_raw: bool = False
+) -> Tuple[str, str]:
     """
     Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
     """
@@ -98,7 +116,7 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, str):
     async with AsyncClient() as client:
         try:
             response = await client.get(
-                url,
+                str(url),
                 follow_redirects=True,
                 headers={"User-Agent": user_agent},
                 timeout=30,
@@ -128,9 +146,13 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, str):
 class Fetch(BaseModel):
-    url: str = Field(..., description="URL to fetch")
-    max_length: int = Field(5000, description="Maximum number of characters to return.")
-    start_index: int = Field(
+    """Parameters for fetching a URL."""
+
+    url: AnyUrl = Field(..., description="URL to fetch")
+    max_length: conint(gt=0, lt=1000000) = Field(
+        5000, description="Maximum number of characters to return."
+    )
+    start_index: conint(ge=0) = Field(
         0,
         description="On return output starting at this character index, useful if a previous fetch was truncated and more context is required.",
     )
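The practical effect of swapping str and int for AnyUrl and conint is that pydantic now rejects malformed arguments when the model is constructed, rather than passing them through. A rough illustration (not part of the commit):

    from pydantic import ValidationError

    Fetch(url="https://example.com", max_length=5000, start_index=0)  # accepted

    try:
        Fetch(url="not-a-url", max_length=-1)  # AnyUrl and conint(gt=0) both reject this
    except ValidationError as err:
        print(err)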
@@ -143,6 +165,12 @@ class Fetch(BaseModel):
 async def serve(
     custom_user_agent: Optional[str] = None, ignore_robots_txt: bool = False
 ) -> None:
+    """Run the fetch MCP server.
+
+    Args:
+        custom_user_agent: Optional custom User-Agent string to use for requests
+        ignore_robots_txt: Whether to ignore robots.txt restrictions
+    """
     server = Server("mcp-fetch")
     user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
     user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL
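Finally, nothing about starting the server changes; a minimal launch sketch against the annotated entry point (assuming serve is imported from the package and default settings are wanted) is:

    import asyncio

    # Hypothetical launcher; relies only on the serve() signature shown above.
    asyncio.run(serve(custom_user_agent=None, ignore_robots_txt=False))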