diff options
| -rw-r--r-- | README.md | 34 | ||||
| -rw-r--r-- | pi/agent/extensions/web-search/README.md | 42 | ||||
| -rw-r--r-- | pi/agent/extensions/web-search/index.ts | 248 |
3 files changed, 324 insertions, 0 deletions
@@ -204,6 +204,40 @@ model switch hyperstack1/openai/gpt-oss-120b Pi sends subsequent requests to the new model ID immediately; the provider base URL stays the same. +## Extensions + +Custom extensions live in `pi/agent/extensions/` and are loaded automatically via the `~/.pi` symlink. + +| Extension | Purpose | +|-----------|---------| +| `web-search` | `web_search` and `web_fetch` tools — DuckDuckGo search + page fetching, no API key | +| `ask-mode` | `/ask` command — restricts the model to read-only exploration tools | +| `loop-scheduler` | `/loop` command — re-sends a prompt on a recurring interval | +| `inline-bash` | `!{cmd}` syntax — expands shell output inline before sending to the model | +| `session-name` | Auto-names sessions from the first message | +| `modal-editor` | Opens an external editor (`$VISUAL`) for composing long prompts | +| `handoff` | Compacts and hands off context to a fresh session | +| `fresh-subagent` | Spawns a sub-agent in a clean context for isolated tasks | +| `reload-runtime` | `/reload-runtime` command — hot-reloads extensions without restarting Pi | +| `nemotron-tool-repair` | Repairs malformed tool calls from Nemotron models | +| `taskwarrior-plan-mode` | Integrates Taskwarrior task management into Pi sessions | + +### Web search + +The `web-search` extension registers two LLM-callable tools: + +- **`web_search`** — searches DuckDuckGo and returns up to 8 results (title, URL, snippet) +- **`web_fetch`** — fetches a URL and returns up to 12,000 characters of readable text + +Example prompts: + +``` +Search for the vLLM 0.9.0 changelog +Find the Qwen3-Coder model card and summarize the recommended vLLM flags +``` + +No API key or account required. Uses DuckDuckGo's free HTML endpoint. 
+ ## Single-VM setup A single VM can be deployed with the default config (GPT-OSS 120B): diff --git a/pi/agent/extensions/web-search/README.md b/pi/agent/extensions/web-search/README.md new file mode 100644 index 0000000..c7b77b3 --- /dev/null +++ b/pi/agent/extensions/web-search/README.md @@ -0,0 +1,42 @@ +# web-search + +Pi.dev extension that gives the LLM two tools for consulting the web during coding sessions. + +## Tools + +### `web_search` + +Searches DuckDuckGo (no API key, no account required) and returns up to 8 results with +titles, URLs, and snippets. Use when the model needs current documentation, release notes, +library APIs, or any information not in its training data. + +### `web_fetch` + +Fetches the full text of a URL. Strips `<script>`, `<style>`, `<nav>`, `<header>`, and +`<footer>` blocks, collapses whitespace, and truncates to 12,000 characters. Use after +`web_search` to read the complete content of a result page. + +## Usage + +The tools are registered automatically when the extension loads. Just ask the model to +look something up: + +``` +Search for the vLLM changelog for version 0.9.0 +``` + +``` +Find the Qwen3 model card on HuggingFace and summarize the recommended vLLM flags +``` + +## Backend + +Uses DuckDuckGo's free HTML endpoint (`https://html.duckduckgo.com/html/`). No API key, +no rate-limit registration, and no personally identifying headers are sent. HTTP requests +time out after 15 seconds. + +## Limitations + +- DuckDuckGo HTML scraping may break if DDG changes its page structure. +- Pages that require JavaScript rendering return little or no content. +- Results are in English (`kl=us-en`). 
// pi/agent/extensions/web-search/index.ts
//
// Pi extension that registers two LLM-callable tools:
//   - web_search: queries DuckDuckGo's HTML endpoint and returns parsed results
//   - web_fetch:  downloads a URL and returns its readable text
import * as https from "node:https";
import * as http from "node:http";
import { URL } from "node:url";
import { Type } from "@sinclair/typebox";
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";

// Maximum number of search results to return per query.
const MAX_RESULTS = 8;

// Maximum characters to include from a fetched page.
const MAX_PAGE_CHARS = 12000;

// Timeout in milliseconds for HTTP requests (socket inactivity).
const REQUEST_TIMEOUT_MS = 15000;

// Maximum number of HTTP redirects followed for one logical request.
const MAX_REDIRECTS = 5;

// Maximum number of response bytes buffered before the download is cut short.
const MAX_RESPONSE_BYTES = 5 * 1024 * 1024;

// Matches one DuckDuckGo result block. The lookahead ends the block at the next
// result, at the end-of-results marker, or at the end of the input so the last
// result is not lost when the marker is absent.
const RESULT_BLOCK_RE =
  /<div class="result__body"[\s\S]*?(?=<div class="result__body"|<\/div><!--end-results-->|$)/g;
const RESULT_TITLE_RE = /<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/;
const RESULT_SNIPPET_RE = /<a[^>]*class="result__snippet"[^>]*>([\s\S]*?)<\/a>/;

const ANY_TAG_RE = /<[^>]+>/g;
const HTML_COMMENT_RE = /<!--[\s\S]*?-->/g;
const ENTITY_RE = /&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g;

// A Map (not an object literal) so names such as "constructor" cannot resolve
// to inherited Object.prototype members.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

interface SearchResult {
  title: string;
  url: string;
  snippet: string;
}

/** Build a regex matching a whole `<tag ...>...</tag>` element, case-insensitively. */
function elementBlockRe(tag: string): RegExp {
  return new RegExp(`<${tag}\\b[\\s\\S]*?<\\/${tag}\\s*>`, "gi");
}

// Elements whose text is code or styling rather than content.
const CODE_BLOCK_RES = ["script", "style"].map(elementBlockRe);

// Elements that usually hold site chrome rather than page content.
const CHROME_BLOCK_RES = ["nav", "header", "footer"].map(elementBlockRe);

/**
 * Fetch a URL over HTTP(S) and return the response body decoded as UTF-8.
 *
 * @param url - Absolute `http:` or `https:` URL.
 * @param extraHeaders - Headers merged over the default browser-like headers.
 * @param redirectsLeft - Remaining redirect budget; callers normally omit it.
 * @returns The response body, cut off after `MAX_RESPONSE_BYTES` bytes.
 * @throws Rejects on an invalid URL, an unsupported protocol, too many
 *   redirects, a non-2xx status, a socket timeout, or a network error.
 */
function fetchUrl(
  url: string,
  extraHeaders: Record<string, string> = {},
  redirectsLeft: number = MAX_REDIRECTS,
): Promise<string> {
  return new Promise((resolve, reject) => {
    // A throw from `new URL` inside the executor rejects the promise.
    const parsed = new URL(url);
    if (parsed.protocol !== "https:" && parsed.protocol !== "http:") {
      reject(new Error(`Unsupported protocol "${parsed.protocol}" for ${url}`));
      return;
    }

    const options = {
      method: "GET",
      headers: {
        "User-Agent":
          "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        Accept: "text/html,application/xhtml+xml,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        ...extraHeaders,
      },
    };

    const onResponse = (res: http.IncomingMessage): void => {
      const status = res.statusCode ?? 0;
      const location = res.headers.location;

      if (status >= 300 && status < 400 && location) {
        res.resume(); // Drain the redirect body so the socket is released.
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects fetching ${url}`));
          return;
        }
        let target: string;
        try {
          // Location may be relative; resolve it against the current URL.
          target = new URL(location, parsed).toString();
        } catch {
          reject(new Error(`Invalid redirect location "${location}" for ${url}`));
          return;
        }
        fetchUrl(target, extraHeaders, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (status < 200 || status >= 300) {
        res.resume();
        reject(new Error(`HTTP ${res.statusCode} for ${url}`));
        return;
      }

      const chunks: Buffer[] = [];
      let received = 0;
      res.on("data", (chunk: Buffer) => {
        const remaining = MAX_RESPONSE_BYTES - received;
        if (remaining <= 0) return;
        const piece = chunk.length > remaining ? chunk.subarray(0, remaining) : chunk;
        chunks.push(piece);
        received += piece.length;
        if (received >= MAX_RESPONSE_BYTES) {
          // Enough data: settle first so later socket errors are ignored.
          resolve(Buffer.concat(chunks).toString("utf-8"));
          res.destroy();
        }
      });
      res.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
      res.on("error", reject);
    };

    // Call each module's request() directly rather than through a union of
    // the two modules, which keeps overload resolution unambiguous.
    const req =
      parsed.protocol === "https:"
        ? https.request(parsed, options, onResponse)
        : http.request(parsed, options, onResponse);

    req.setTimeout(REQUEST_TIMEOUT_MS, () => {
      // Reject before destroying so the timeout message wins over the
      // socket error that destroy() may emit afterwards.
      reject(new Error(`Timeout fetching ${url}`));
      req.destroy();
    });

    req.on("error", reject);
    req.end();
  });
}

/**
 * Decode HTML character references in a single pass.
 *
 * Handles the named entities in `NAMED_ENTITIES` plus decimal and hexadecimal
 * numeric references. Unknown or out-of-range references are left untouched.
 * Decoding in one pass means `&amp;lt;` yields `&lt;`, not `<`.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(ENTITY_RE, (entity: string, body: string) => {
    if (!body.startsWith("#")) {
      return NAMED_ENTITIES.get(body) ?? entity;
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws outside the Unicode range.
    const inRange = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
    return inRange ? String.fromCodePoint(codePoint) : entity;
  });
}

/** Strip tags from an HTML fragment, decode its entities, and trim it. */
function fragmentToText(fragment: string): string {
  return decodeHtmlEntities(fragment.replace(ANY_TAG_RE, "")).trim();
}

/**
 * Turn the `href` of a DuckDuckGo result link into the destination URL.
 *
 * Result links are DDG redirects that carry the real URL in the `uddg` query
 * parameter. `URLSearchParams.get` already percent-decodes the value, so it
 * must not be decoded a second time.
 */
function resolveResultUrl(rawHref: string): string {
  // Attribute values are HTML-escaped (e.g. `&amp;` between query parameters).
  const href = decodeHtmlEntities(rawHref);
  try {
    // The base resolves protocol-relative (`//host/...`) and relative hrefs.
    const hrefUrl = new URL(href, "https://duckduckgo.com/");
    return hrefUrl.searchParams.get("uddg") || hrefUrl.toString();
  } catch {
    // Keep the href as-is if it cannot be parsed.
    return href;
  }
}

/**
 * Extract up to `MAX_RESULTS` results from a DuckDuckGo HTML results page.
 *
 * Each result block looks like:
 *   <div class="result__body">
 *     <a class="result__a" href="...">Title</a>
 *     <a class="result__snippet">Snippet text</a>
 *   </div>
 * Blocks without a title link, or with neither title nor snippet text, are
 * skipped.
 */
function parseDuckDuckGoResults(html: string): SearchResult[] {
  const results: SearchResult[] = [];

  // matchAll works on a copy of the regex, so the shared /g regex never keeps
  // a stale lastIndex when the loop exits early.
  for (const blockMatch of html.matchAll(RESULT_BLOCK_RE)) {
    if (results.length >= MAX_RESULTS) break;
    const blockHtml = blockMatch[0];

    const titleMatch = RESULT_TITLE_RE.exec(blockHtml);
    if (!titleMatch) continue;

    const title = fragmentToText(titleMatch[2] ?? "");
    const snippet = fragmentToText(RESULT_SNIPPET_RE.exec(blockHtml)?.[1] ?? "");
    if (!title && !snippet) continue;

    results.push({ title, url: resolveResultUrl(titleMatch[1] ?? ""), snippet });
  }

  return results;
}

/**
 * Search DuckDuckGo using the HTML interface (no API key required).
 *
 * @param query - Search terms, sent with the `kl=us-en` region setting.
 * @returns Parsed results; an empty array when none are found.
 * @throws Rejects when the HTTP request fails (see `fetchUrl`).
 */
async function searchDuckDuckGo(query: string): Promise<SearchResult[]> {
  const params = new URLSearchParams({ q: query, kl: "us-en" });
  const html = await fetchUrl(`https://html.duckduckgo.com/html/?${params}`, {
    // DuckDuckGo HTML endpoint requires an Accept header to avoid redirects.
    Accept: "text/html",
  });
  return parseDuckDuckGoResults(html);
}

/**
 * Reduce an HTML document to plain text.
 *
 * Removes `<script>` and `<style>` elements, HTML comments, and `<nav>`,
 * `<header>`, and `<footer>` elements, then strips the remaining tags,
 * decodes entities, and collapses runs of whitespace into a single space.
 */
function htmlToText(html: string): string {
  let text = html;
  // Code and styles go first so markup-like text inside them cannot confuse
  // the later passes.
  for (const re of CODE_BLOCK_RES) text = text.replace(re, " ");
  text = text.replace(HTML_COMMENT_RE, " ");
  for (const re of CHROME_BLOCK_RES) text = text.replace(re, " ");

  return decodeHtmlEntities(text.replace(ANY_TAG_RE, " "))
    .replace(/\s{2,}/g, " ")
    .trim();
}

/**
 * Fetch a web page and extract its readable text content.
 *
 * @param url - Absolute `http:` or `https:` URL of the page.
 * @returns The page text, truncated to `MAX_PAGE_CHARS` characters and
 *   followed by a truncation notice when it was longer.
 * @throws Rejects when the page cannot be fetched (see `fetchUrl`).
 */
async function fetchPage(url: string): Promise<string> {
  const text = htmlToText(await fetchUrl(url));
  if (text.length <= MAX_PAGE_CHARS) return text;
  return text.slice(0, MAX_PAGE_CHARS) + `\n\n[... truncated at ${MAX_PAGE_CHARS} chars]`;
}

/** Format search results as a numbered plain-text list for the LLM. */
function formatResults(results: SearchResult[]): string {
  if (results.length === 0) return "No results found.";
  return results
    .map((r, i) => `${i + 1}. ${r.title}\n   URL: ${r.url}\n   ${r.snippet}`)
    .join("\n\n");
}

/**
 * Extension entry point: registers the `web_search` and `web_fetch` tools.
 *
 * Both tools report failures as a result with `isError: true` rather than
 * throwing.
 */
export default function webSearchExtension(pi: ExtensionAPI): void {
  // Tool: search the web and return a list of results with titles and snippets.
  pi.registerTool({
    name: "web_search",
    label: "Web Search",
    description: `Search the web using DuckDuckGo (no API key required). Returns up to ${MAX_RESULTS} results with titles, URLs, and snippets. Use this when you need current information, documentation, or anything not in your training data.`,
    promptSnippet: "Search the web for current information",
    parameters: Type.Object({
      query: Type.String({
        description: "The search query to look up on DuckDuckGo",
      }),
    }),
    async execute(_toolCallId, params, _signal) {
      try {
        const query = params.query.trim();
        // Do not spend a network round trip on a blank query.
        if (!query) throw new Error("query must not be empty");

        const results = await searchDuckDuckGo(query);
        return {
          content: [{ type: "text", text: formatResults(results) }],
          details: { query: params.query, resultCount: results.length, results },
        };
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
          content: [{ type: "text", text: `Search failed: ${msg}` }],
          details: { query: params.query, error: msg },
          isError: true,
        };
      }
    },
  });

  // Tool: fetch a specific URL and return its text content.
  // Useful after a web_search to read the full content of a result.
  pi.registerTool({
    name: "web_fetch",
    label: "Web Fetch",
    description: `Fetch the text content of a specific URL. Use after web_search to read the full content of a result page. Returns up to ${MAX_PAGE_CHARS.toLocaleString("en-US")} characters of readable text.`,
    promptSnippet: "Fetch and read a specific URL",
    parameters: Type.Object({
      url: Type.String({
        description: "The full URL to fetch (must start with http:// or https://)",
      }),
    }),
    async execute(_toolCallId, params, _signal) {
      try {
        const text = await fetchPage(params.url.trim());
        return {
          content: [{ type: "text", text }],
          details: { url: params.url, length: text.length },
        };
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return {
          content: [{ type: "text", text: `Fetch failed: ${msg}` }],
          details: { url: params.url, error: msg },
          isError: true,
        };
      }
    },
  });
}
