Files
ollama/app/tools/browser_crawl.go
2025-10-23 11:27:41 -07:00

197 lines
5.3 KiB
Go

package tools
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"strconv"
"time"
"github.com/ollama/ollama/auth"
)
// CrawlContent represents the content of a crawled page
type CrawlContent struct {
Snippet string `json:"snippet"`
FullText string `json:"full_text"`
}
// CrawlExtras represents additional data from the crawl API
type CrawlExtras struct {
Links []CrawlLink `json:"links"`
}
// CrawlLink represents a link found on a crawled page
type CrawlLink struct {
URL string `json:"url"`
Href string `json:"href"`
Text string `json:"text"`
}
// CrawlResult represents a single crawl result
type CrawlResult struct {
Title string `json:"title"`
URL string `json:"url"`
Content CrawlContent `json:"content"`
Extras CrawlExtras `json:"extras"`
}
// CrawlResponse represents the complete response from the crawl API
type CrawlResponse struct {
Results map[string][]CrawlResult `json:"results"`
}
// BrowserCrawler tool for crawling web pages using ollama.com crawl API
type BrowserCrawler struct{}
func (g *BrowserCrawler) Name() string {
return "get_webpage"
}
func (g *BrowserCrawler) Description() string {
return "Crawl and extract text content from web pages"
}
func (g *BrowserCrawler) Prompt() string {
return `When you need to read content from web pages, use the get_webpage tool. Simply provide the URLs you want to read and I'll fetch their content for you.
For each URL, I'll extract the main text content in a readable format. If you need to discover links within those pages, set extract_links to true. If the user requires the latest information, set livecrawl to true.
Only use this tool when you need to access current web content. Make sure the URLs are valid and accessible. Do not use this tool for:
- Downloading files or media
- Accessing private/authenticated pages
- Scraping data at high volumes
Always check the returned content to ensure it's relevant before using it in your response.`
}
func (g *BrowserCrawler) Schema() map[string]any {
schemaBytes := []byte(`{
"type": "object",
"properties": {
"urls": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of URLs to crawl and extract content from"
},
"latest": {
"type": "boolean",
"description": " Needs up to date and latest information (default: false)",
"default": false
}
},
"required": ["urls"]
}`)
var schema map[string]any
if err := json.Unmarshal(schemaBytes, &schema); err != nil {
return nil
}
return schema
}
func (g *BrowserCrawler) Execute(ctx context.Context, args map[string]any) (*CrawlResponse, error) {
// Extract and validate URLs
urlsRaw, ok := args["urls"].([]any)
if !ok {
return nil, fmt.Errorf("urls parameter is required and must be an array of strings")
}
urls := make([]string, 0, len(urlsRaw))
for _, u := range urlsRaw {
if urlStr, ok := u.(string); ok {
urls = append(urls, urlStr)
}
}
if len(urls) == 0 {
return nil, fmt.Errorf("at least one URL is required")
}
latest, _ := args["latest"].(bool)
// Perform the web crawling
return g.performWebCrawl(ctx, urls, latest)
}
// performWebCrawl handles the actual HTTP request to ollama.com crawl API
func (g *BrowserCrawler) performWebCrawl(ctx context.Context, urls []string, latest bool) (*CrawlResponse, error) {
// Prepare the request body matching the API format
reqBody := map[string]any{
"urls": urls,
"text": true,
"extras": map[string]any{
"links": 1,
},
"livecrawl": "fallback",
}
if latest {
reqBody["livecrawl"] = "always"
}
jsonBody, err := json.Marshal(reqBody)
if err != nil {
return nil, fmt.Errorf("failed to marshal request body: %w", err)
}
crawlURL, err := url.Parse("https://ollama.com/api/tools/webcrawl")
if err != nil {
return nil, fmt.Errorf("failed to parse crawl URL: %w", err)
}
// Add timestamp for signing
query := crawlURL.Query()
query.Add("ts", strconv.FormatInt(time.Now().Unix(), 10))
var signature string
crawlURL.RawQuery = query.Encode()
// Sign the request data (method + URI)
data := fmt.Appendf(nil, "%s,%s", http.MethodPost, crawlURL.RequestURI())
signature, err = auth.Sign(ctx, data)
if err != nil {
return nil, fmt.Errorf("failed to sign request: %w", err)
}
// Create the request
req, err := http.NewRequestWithContext(ctx, "POST", crawlURL.String(), bytes.NewBuffer(jsonBody))
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
// Set headers
req.Header.Set("Content-Type", "application/json")
if signature != "" {
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", signature))
}
// Make the request
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("failed to execute crawl request: %w", err)
}
defer resp.Body.Close()
// Read and parse response
var result CrawlResponse
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
}
// Check for error response
if resp.StatusCode != http.StatusOK {
errMsg := "unknown error"
if resp.StatusCode == http.StatusServiceUnavailable {
errMsg = "crawl service unavailable - API key may not be configured"
}
return nil, fmt.Errorf("crawl API error (status %d): %s", resp.StatusCode, errMsg)
}
return &result, nil
}