Merge remote-tracking branch 'upstream/main' into burkeholland-vscode-install-instructions

This commit is contained in:
Burke Holland
2025-04-10 09:29:23 -05:00
16 changed files with 488 additions and 84 deletions

View File

@@ -155,6 +155,10 @@ ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotoc
This can be customized by adding the argument `--user-agent=YourUserAgent` to the `args` list in the configuration.
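For example, with an MCP client configured to launch the server via `uvx`, the override might look like the following (a minimal sketch; the surrounding `mcpServers` block is the configuration format assumed here, and `YourUserAgent` is a placeholder):

```json
{
  "mcpServers": {
    "fetch": {
      "command": "uvx",
      "args": ["mcp-server-fetch", "--user-agent=YourUserAgent"]
    }
  }
}
```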
+### Customization - Proxy
+
+The server can be configured to use a proxy by using the `--proxy-url` argument.
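Assuming the same `uvx`-based configuration, routing requests through a local HTTP proxy on port 8080 (both the launcher and the address are placeholders) might look like:

```json
{
  "mcpServers": {
    "fetch": {
      "command": "uvx",
      "args": ["mcp-server-fetch", "--proxy-url=http://localhost:8080"]
    }
  }
}
```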
## Debugging
You can use the MCP inspector to debug the server. For uvx installations:
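For uvx installations, an invocation along these lines is typical (assuming Node.js is available for `npx`):

```sh
npx @modelcontextprotocol/inspector uvx mcp-server-fetch
```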

View File

@@ -1,6 +1,6 @@
[project]
name = "mcp-server-fetch"
version = "0.6.2"
version = "0.6.3"
description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
readme = "README.md"
requires-python = ">=3.10"
@@ -16,6 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
]
dependencies = [
"httpx<0.28",
"markdownify>=0.13.1",
"mcp>=1.1.3",
"protego>=0.3.1",

View File

@@ -15,9 +15,10 @@ def main():
action="store_true",
help="Ignore robots.txt restrictions",
)
+parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
args = parser.parse_args()
-asyncio.run(serve(args.user_agent, args.ignore_robots_txt))
+asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
if __name__ == "__main__":
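With this wiring in place, the new flag can be exercised directly from the package entry point; a hypothetical local run (the proxy address is a placeholder) would be:

```sh
python -m mcp_server_fetch --proxy-url http://localhost:8080
```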

View File

@@ -63,7 +63,7 @@ def get_robots_txt_url(url: str) -> str:
return robots_url
-async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
+async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: str | None = None) -> None:
"""
Check if the URL can be fetched by the user agent according to the robots.txt file.
Raises a McpError if not.
@@ -72,7 +72,7 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
robot_txt_url = get_robots_txt_url(url)
-async with AsyncClient() as client:
+async with AsyncClient(proxies=proxy_url) as client:
try:
response = await client.get(
robot_txt_url,
@@ -109,14 +109,14 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str) -> None:
async def fetch_url(
-url: str, user_agent: str, force_raw: bool = False
+url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
"""
from httpx import AsyncClient, HTTPError
-async with AsyncClient() as client:
+async with AsyncClient(proxies=proxy_url) as client:
try:
response = await client.get(
url,
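The `proxies=` keyword used above is the pre-0.28 httpx spelling, which is presumably why this same commit pins `httpx<0.28` in `pyproject.toml`; httpx 0.28 removed `proxies` in favor of `proxy`. A self-contained sketch of the pattern, with a hypothetical helper name:

```python
import asyncio

from httpx import AsyncClient


async def fetch_via_proxy(url: str, proxy_url: str | None = None) -> str:
    # Passing proxies=None is equivalent to no proxy, so the same code path
    # serves both configurations (httpx < 0.28 API).
    async with AsyncClient(proxies=proxy_url) as client:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()
        return response.text


# Placeholder proxy address for illustration:
print(asyncio.run(fetch_via_proxy("https://example.com", "http://localhost:8080")))
```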
@@ -173,19 +173,22 @@ class Fetch(BaseModel):
bool,
Field(
default=False,
description="Get the actual HTML content if the requested page, without simplification.",
description="Get the actual HTML content of the requested page, without simplification.",
),
]
async def serve(
-custom_user_agent: str | None = None, ignore_robots_txt: bool = False
+custom_user_agent: str | None = None,
+ignore_robots_txt: bool = False,
+proxy_url: str | None = None,
) -> None:
"""Run the fetch MCP server.
Args:
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
+proxy_url: Optional proxy URL to use for requests
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
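Given the widened signature, an embedder could launch the server with a proxy roughly as follows (a sketch assuming the package layout implied by the diff; the agent string and proxy address are placeholders):

```python
import asyncio

from mcp_server_fetch.server import serve

# serve() runs the MCP fetch server until the client disconnects; every
# outbound request will use the supplied User-Agent and proxy.
asyncio.run(
    serve(
        custom_user_agent="MyAgent/1.0",
        ignore_robots_txt=False,
        proxy_url="http://localhost:8080",
    )
)
```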
@@ -229,10 +232,10 @@ Although originally you did not have internet access, and were advised to refuse
raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required"))
if not ignore_robots_txt:
-await check_may_autonomously_fetch_url(url, user_agent_autonomous)
+await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
content, prefix = await fetch_url(
-url, user_agent_autonomous, force_raw=args.raw
+url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
)
original_length = len(content)
if args.start_index >= original_length:
@@ -259,7 +262,7 @@ Although originally you did not have internet access, and were advised to refuse
url = arguments["url"]
try:
-content, prefix = await fetch_url(url, user_agent_manual)
+content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(