From fa6efc4b9417dc76aaefcf83fccf3d0bb0dc95d9 Mon Sep 17 00:00:00 2001 From: Ashpreet Date: Fri, 10 Apr 2026 12:44:34 +0100 Subject: [PATCH 01/23] feat: add LLMsTxtReader and LLMsTxtTools for llms.txt support Add a reader and toolkit for the llms.txt standard (https://llmstxt.org), enabling agents to discover and consume documentation indexes. LLMsTxtReader: fetches an llms.txt URL, parses the standardized markdown format to extract all linked doc URLs, fetches page content (handling HTML, markdown, plain text), and returns Documents with section/title metadata. Async variant fetches all pages concurrently. LLMsTxtTools provides two modes: - Agentic: get_llms_txt_index returns the index so the agent picks which pages to read, then read_llms_txt_url fetches individual pages. - Knowledge: read_llms_txt_and_load_knowledge bulk-fetches all linked pages and inserts them into a Knowledge base. Includes 32 unit tests and 2 cookbook examples. Co-Authored-By: Claude Opus 4.6 (1M context) --- cookbook/91_tools/llms_txt_tools.py | 42 ++ cookbook/91_tools/llms_txt_tools_knowledge.py | 56 +++ .../agno/knowledge/reader/llms_txt_reader.py | 361 ++++++++++++++++ .../agno/knowledge/reader/reader_factory.py | 17 + libs/agno/agno/tools/llms_txt.py | 161 +++++++ libs/agno/tests/unit/tools/test_llms_txt.py | 398 ++++++++++++++++++ 6 files changed, 1035 insertions(+) create mode 100644 cookbook/91_tools/llms_txt_tools.py create mode 100644 cookbook/91_tools/llms_txt_tools_knowledge.py create mode 100644 libs/agno/agno/knowledge/reader/llms_txt_reader.py create mode 100644 libs/agno/agno/tools/llms_txt.py create mode 100644 libs/agno/tests/unit/tools/test_llms_txt.py diff --git a/cookbook/91_tools/llms_txt_tools.py b/cookbook/91_tools/llms_txt_tools.py new file mode 100644 index 0000000000..cb2379c2ec --- /dev/null +++ b/cookbook/91_tools/llms_txt_tools.py @@ -0,0 +1,42 @@ +""" +LLMs.txt Tools - Agentic Documentation Discovery +============================= + +Demonstrates how to use 
LLMsTxtTools in agentic mode where the agent: +1. Reads the llms.txt index to discover available documentation pages +2. Decides which pages are relevant to the user's question +3. Fetches only the specific pages it needs + +The llms.txt format (https://llmstxt.org) is a standardized way for websites +to provide LLM-friendly documentation indexes. +""" + +from agno.agent import Agent +from agno.models.openai import OpenAIResponses +from agno.tools.llms_txt import LLMsTxtTools + +# --------------------------------------------------------------------------- +# Create Agent +# --------------------------------------------------------------------------- + +agent = Agent( + model=OpenAIResponses(id="gpt-5.4"), + tools=[LLMsTxtTools()], + instructions=[ + "You can read llms.txt files to discover documentation for any project.", + "First use get_llms_txt_index to see what pages are available.", + "Then use read_llms_txt_url to fetch only the pages relevant to the user's question.", + ], + markdown=True, +) + +# --------------------------------------------------------------------------- +# Run Agent +# --------------------------------------------------------------------------- +if __name__ == "__main__": + agent.print_response( + "Using the llms.txt at https://docs.agno.com/llms.txt, " + "find and read the documentation about how to create an agent with tools", + markdown=True, + stream=True, + ) diff --git a/cookbook/91_tools/llms_txt_tools_knowledge.py b/cookbook/91_tools/llms_txt_tools_knowledge.py new file mode 100644 index 0000000000..ce1c131f99 --- /dev/null +++ b/cookbook/91_tools/llms_txt_tools_knowledge.py @@ -0,0 +1,56 @@ +""" +LLMs.txt Tools with Knowledge Base +============================= + +Demonstrates loading all documentation from an llms.txt file into a knowledge base +for retrieval-augmented generation (RAG). + +The agent reads the llms.txt index, fetches all linked documentation pages, +and stores them in a PgVector knowledge base for semantic search. 
+""" + +from agno.agent import Agent +from agno.knowledge.knowledge import Knowledge +from agno.models.openai import OpenAIResponses +from agno.tools.llms_txt import LLMsTxtTools +from agno.vectordb.pgvector import PgVector + +# --------------------------------------------------------------------------- +# Setup Knowledge Base +# --------------------------------------------------------------------------- + +db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai" + +knowledge = Knowledge( + vector_db=PgVector( + table_name="llms_txt_docs", + db_url=db_url, + ), +) + +# --------------------------------------------------------------------------- +# Create Agent +# --------------------------------------------------------------------------- + +agent = Agent( + model=OpenAIResponses(id="gpt-5.4"), + knowledge=knowledge, + search_knowledge=True, + tools=[LLMsTxtTools(knowledge=knowledge, max_urls=20)], + instructions=[ + "You can load documentation from llms.txt files into your knowledge base.", + "When asked about a project, first load its llms.txt into the knowledge base, then answer questions.", + ], + markdown=True, +) + +# --------------------------------------------------------------------------- +# Run Agent +# --------------------------------------------------------------------------- +if __name__ == "__main__": + agent.print_response( + "Load the documentation from https://docs.agno.com/llms.txt into the knowledge base, " + "then tell me how to create an agent with Agno", + markdown=True, + stream=True, + ) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py new file mode 100644 index 0000000000..66e2256336 --- /dev/null +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -0,0 +1,361 @@ +import asyncio +import re +import uuid +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from urllib.parse import urljoin + +import httpx + +from 
import asyncio
import re
import uuid
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urljoin

import httpx

from agno.knowledge.chunking.fixed import FixedSizeChunking
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
from agno.knowledge.document.base import Document
from agno.knowledge.reader.base import Reader
from agno.knowledge.types import ContentType
from agno.utils.log import log_debug, log_error, log_warning

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.")


# Pattern to match markdown links: - [Title](url) or - [Title](url): description
_LINK_PATTERN = re.compile(r"-\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.+))?")
# Pattern to match H2 section headers
_SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE)
# Content types whose bodies are already LLM-friendly and are returned verbatim
_PLAIN_TEXT_TYPES = ("text/plain", "text/markdown")
# Lowercased payload prefixes that identify HTML when the content-type header
# is missing or wrong.  NOTE(review): original sniff literal was unreadable in
# this view — confirm these prefixes match the intended set.
_HTML_PREFIXES = ("<!doctype", "<html")


@dataclass
class LLMsTxtEntry:
    """A single entry parsed from an llms.txt file."""

    # Link text, e.g. "Introduction"
    title: str
    # Absolute URL; relative links are resolved against the llms.txt URL
    url: str
    # Optional text after "):" on the link line ("" when absent)
    description: str
    # Name of the enclosing H2 section ("" for links before the first H2)
    section: str


class LLMsTxtReader(Reader):
    """Reader for llms.txt files.

    Reads an llms.txt file (see https://llmstxt.org), parses all linked documentation URLs,
    fetches the content of each linked page, and returns them as Documents.

    The llms.txt format is a standardized markdown file with:
    - An H1 heading (project name)
    - An optional blockquote summary
    - H2-delimited sections containing markdown links to documentation pages

    Example:
        reader = LLMsTxtReader(max_urls=50)
        documents = reader.read("https://docs.example.com/llms.txt")
    """

    def __init__(
        self,
        chunking_strategy: Optional[ChunkingStrategy] = None,
        max_urls: int = 100,
        timeout: int = 30,
        proxy: Optional[str] = None,
        include_llms_txt_content: bool = True,
        skip_optional: bool = False,
        **kwargs,
    ):
        """Initialize the LLMsTxtReader.

        Args:
            chunking_strategy: Strategy for chunking documents. Defaults to
                FixedSizeChunking, honouring a `chunk_size` kwarg when given.
            max_urls: Maximum number of linked URLs to fetch. Defaults to 100.
            timeout: HTTP request timeout in seconds. Defaults to 30.
            proxy: Optional HTTP proxy URL.
            include_llms_txt_content: Whether to include the llms.txt overview itself as a document.
            skip_optional: Whether to skip URLs in the "Optional" section.
        """
        if chunking_strategy is None:
            # chunk_size is deliberately read (not popped) from kwargs so the
            # base Reader also sees it.
            chunking_strategy = FixedSizeChunking(chunk_size=kwargs.get("chunk_size", 5000))
        super().__init__(chunking_strategy=chunking_strategy, **kwargs)
        self.max_urls = max_urls
        self.timeout = timeout
        self.proxy = proxy
        self.include_llms_txt_content = include_llms_txt_content
        self.skip_optional = skip_optional

    @classmethod
    def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]:
        """Chunking strategies this reader can be configured with."""
        return [
            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
            ChunkingStrategyType.AGENTIC_CHUNKER,
            ChunkingStrategyType.DOCUMENT_CHUNKER,
            ChunkingStrategyType.RECURSIVE_CHUNKER,
            ChunkingStrategyType.SEMANTIC_CHUNKER,
        ]

    @classmethod
    def get_supported_content_types(cls) -> List[ContentType]:
        """This reader only consumes URLs (the llms.txt location)."""
        return [ContentType.URL]

    def _parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]:
        """Parse an llms.txt file and extract all linked URLs.

        Everything before the first H2 header is treated as the overview
        (H1 title, blockquote summary, free text).  After the first H2, each
        "- [Title](url): description" line becomes an LLMsTxtEntry tagged
        with its enclosing section name.

        Args:
            content: The raw text content of the llms.txt file.
            base_url: The base URL for resolving relative links.

        Returns:
            A tuple of (overview text, list of LLMsTxtEntry).
        """
        entries: List[LLMsTxtEntry] = []
        current_section = ""
        in_optional = False

        overview_lines: List[str] = []
        past_first_section = False

        for line in content.split("\n"):
            section_match = _SECTION_PATTERN.match(line)
            if section_match:
                current_section = section_match.group(1).strip()
                past_first_section = True
                # "Optional" is the one section the llms.txt spec marks as skippable.
                in_optional = current_section.lower() == "optional"
                continue

            if not past_first_section:
                overview_lines.append(line)
                continue

            if self.skip_optional and in_optional:
                continue

            link_match = _LINK_PATTERN.match(line.strip())
            if not link_match:
                continue

            page_url = link_match.group(2).strip()
            # Resolve relative links against the llms.txt location.
            if not page_url.startswith(("http://", "https://")):
                page_url = urljoin(base_url, page_url)

            entries.append(
                LLMsTxtEntry(
                    title=link_match.group(1).strip(),
                    url=page_url,
                    description=(link_match.group(3) or "").strip(),
                    section=current_section,
                )
            )

        return "\n".join(overview_lines).strip(), entries

    def _extract_content(self, html: str) -> str:
        """Extract readable text content from HTML.

        Strips chrome (scripts, styles, navigation, header/footer/aside),
        then prefers the <main>/<article>/role="main" region, falling back to
        <body>, then the whole document.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Remove elements that never carry documentation content.
        for tag in soup.find_all(["script", "style", "nav", "header", "footer", "aside"]):
            tag.decompose()

        main = soup.find("main") or soup.find("article") or soup.find(attrs={"role": "main"})
        if main:
            return main.get_text(strip=True, separator=" ")

        body = soup.find("body")
        if body:
            return body.get_text(strip=True, separator=" ")

        return soup.get_text(strip=True, separator=" ")

    def _response_to_text(self, content_type: str, text: str) -> str:
        """Convert a fetched response body to readable text.

        Plain text and markdown are returned verbatim; HTML (per the
        content-type header, or sniffed from the payload) is reduced to its
        readable text.  Anything else is returned as-is — assumed to already
        be consumable text (TODO confirm against the upstream fallback).
        """
        if any(t in content_type for t in _PLAIN_TEXT_TYPES):
            return text
        if "text/html" in content_type or text.lstrip().lower().startswith(_HTML_PREFIXES):
            return self._extract_content(text)
        return text

    def _fetch_url(self, url: str) -> Optional[str]:
        """Fetch content from a URL.

        Returns the readable text of the page, or None on any HTTP/transport
        error (errors are logged, never raised, so one bad link does not abort
        a bulk read).
        """
        try:
            log_debug(f"Fetching: {url}")
            request_kwargs: Dict[str, Any] = {"timeout": self.timeout, "follow_redirects": True}
            if self.proxy:
                request_kwargs["proxy"] = self.proxy
            response = httpx.get(url, **request_kwargs)
            response.raise_for_status()
            return self._response_to_text(response.headers.get("content-type", ""), response.text)
        except httpx.HTTPError as e:
            # Covers both HTTPStatusError (4xx/5xx) and RequestError (transport).
            log_warning(f"Failed to fetch {url}: {e}")
            return None

    async def _async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]:
        """Asynchronously fetch content from a URL using a shared client.

        Same contract as _fetch_url: readable text, or None on error.
        Proxy configuration is expected to live on the client.
        """
        try:
            log_debug(f"Fetching asynchronously: {url}")
            response = await client.get(url, timeout=self.timeout, follow_redirects=True)
            response.raise_for_status()
            return self._response_to_text(response.headers.get("content-type", ""), response.text)
        except httpx.HTTPError as e:
            log_warning(f"Failed to fetch {url}: {e}")
            return None

    def _as_documents(self, doc: Document) -> List[Document]:
        """Return *doc* chunked per the reader's strategy, or as-is when chunking is off."""
        return self.chunk_document(doc) if self.chunk else [doc]

    def _build_documents(
        self,
        overview: str,
        entries: List[LLMsTxtEntry],
        fetched: Dict[str, str],
        llms_txt_url: str,
        name: Optional[str],
    ) -> List[Document]:
        """Build the Document list from the parsed overview and fetched pages.

        Entries whose URL is absent from *fetched* (fetch failed) are skipped.
        """
        documents: List[Document] = []

        # Optionally include the llms.txt overview itself as a document.
        if self.include_llms_txt_content and overview:
            documents.extend(
                self._as_documents(
                    Document(
                        name=name or llms_txt_url,
                        id=str(uuid.uuid4()),
                        meta_data={
                            "url": llms_txt_url,
                            "type": "llms_txt_overview",
                        },
                        content=overview,
                    )
                )
            )

        # One document per successfully fetched linked page.
        for entry in entries:
            content = fetched.get(entry.url)
            if not content:
                continue
            documents.extend(
                self._as_documents(
                    Document(
                        name=entry.title,
                        id=str(uuid.uuid4()),
                        meta_data={
                            "url": entry.url,
                            "section": entry.section,
                            "description": entry.description,
                            "type": "llms_txt_linked_doc",
                        },
                        content=content,
                    )
                )
            )

        return documents

    def read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """Read an llms.txt file and all its linked documentation.

        Args:
            url: The URL of the llms.txt file.
            name: Optional name for the documents.

        Returns:
            A list of documents from the llms.txt and all linked pages;
            empty when the llms.txt itself cannot be fetched.
        """
        log_debug(f"Reading llms.txt: {url}")

        llms_txt_content = self._fetch_url(url)
        if not llms_txt_content:
            log_error(f"Failed to fetch llms.txt from {url}")
            return []

        overview, entries = self._parse_llms_txt(llms_txt_content, url)
        log_debug(f"Found {len(entries)} linked URLs in llms.txt")

        # Cap the number of linked pages fetched.
        entries_to_fetch = entries[: self.max_urls]
        if len(entries) > self.max_urls:
            log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})")

        # Fetch all linked pages sequentially (use async_read for concurrency).
        fetched: Dict[str, str] = {}
        for entry in entries_to_fetch:
            content = self._fetch_url(entry.url)
            if content:
                fetched[entry.url] = content

        log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages")
        return self._build_documents(overview, entries_to_fetch, fetched, url, name)

    async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """Asynchronously read an llms.txt file and all its linked documentation.

        Linked pages are fetched concurrently over a single shared client.

        Args:
            url: The URL of the llms.txt file.
            name: Optional name for the documents.

        Returns:
            A list of documents from the llms.txt and all linked pages;
            empty when the llms.txt itself cannot be fetched.
        """
        log_debug(f"Reading llms.txt asynchronously: {url}")

        client_args = {"proxy": self.proxy} if self.proxy else {}
        async with httpx.AsyncClient(**client_args) as client:  # type: ignore
            llms_txt_content = await self._async_fetch_url(client, url)
            if not llms_txt_content:
                log_error(f"Failed to fetch llms.txt from {url}")
                return []

            overview, entries = self._parse_llms_txt(llms_txt_content, url)
            log_debug(f"Found {len(entries)} linked URLs in llms.txt")

            entries_to_fetch = entries[: self.max_urls]
            if len(entries) > self.max_urls:
                log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})")

            async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]:
                # Pair each result with its URL so gather order doesn't matter.
                return entry.url, await self._async_fetch_url(client, entry.url)

            results = await asyncio.gather(*(_fetch_entry(e) for e in entries_to_fetch))
            fetched: Dict[str, str] = {page_url: page_text for page_url, page_text in results if page_text}

            log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages")
            return self._build_documents(overview, entries_to_fetch, fetched, url, name)
import json
from typing import Any, List, Optional

from agno.knowledge.document import Document
from agno.knowledge.knowledge import Knowledge
from agno.tools import Toolkit
from agno.utils.log import log_debug, log_info


class LLMsTxtTools(Toolkit):
    """Tools for reading llms.txt files and loading their linked documentation into a knowledge base.

    The llms.txt format (see https://llmstxt.org) is a standardized way for websites to provide
    LLM-friendly documentation indexes.

    This toolkit provides two usage modes:

    **Agentic mode (without knowledge):** The agent gets two tools:
    - `get_llms_txt_index` - reads the llms.txt and returns the index of available docs
    - `read_llms_txt_url` - fetches a specific URL from the index
    The agent reads the index, decides which pages are relevant, and fetches only those.

    **Knowledge mode (with knowledge):** The agent gets one tool:
    - `read_llms_txt_and_load_knowledge` - reads the llms.txt, fetches all linked pages,
      and loads them into the knowledge base.

    Args:
        knowledge: Optional Knowledge instance. When provided, enables knowledge loading mode.
        max_urls: Maximum number of linked URLs to fetch when loading into knowledge. Defaults to 100.
        timeout: HTTP request timeout in seconds. Defaults to 30.
        skip_optional: Whether to skip URLs listed in the "Optional" section. Defaults to False.

    Example:
        # Agentic mode - agent reads index and picks which docs to fetch
        tools = LLMsTxtTools()
        agent = Agent(tools=[tools])

        # Knowledge mode - bulk load all docs into KB
        knowledge = Knowledge(vector_db=my_vector_db)
        tools = LLMsTxtTools(knowledge=knowledge)
        agent = Agent(tools=[tools], knowledge=knowledge)
    """

    def __init__(
        self,
        knowledge: Optional[Knowledge] = None,
        max_urls: int = 100,
        timeout: int = 30,
        skip_optional: bool = False,
        **kwargs,
    ):
        self.knowledge: Optional[Knowledge] = knowledge
        self.max_urls = max_urls
        self.timeout = timeout
        self.skip_optional = skip_optional

        # The two modes are mutually exclusive: bulk-load tool when a
        # knowledge base is supplied, agentic index/fetch tools otherwise.
        tools: List[Any] = []
        if self.knowledge is not None:
            tools.append(self.read_llms_txt_and_load_knowledge)
        else:
            tools.append(self.get_llms_txt_index)
            tools.append(self.read_llms_txt_url)

        super().__init__(name="llms_txt_tools", tools=tools, **kwargs)

    def _build_reader(self, *, limit_urls: bool = False):
        """Construct an LLMsTxtReader configured from this toolkit's settings.

        The import is deferred so the reader's optional dependency (bs4) is
        only required when a tool actually runs.

        Args:
            limit_urls: Whether to cap bulk fetching at self.max_urls. The
                index tool intentionally lists ALL pages regardless of the cap.
        """
        from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader

        if limit_urls:
            return LLMsTxtReader(max_urls=self.max_urls, timeout=self.timeout, skip_optional=self.skip_optional)
        return LLMsTxtReader(timeout=self.timeout, skip_optional=self.skip_optional)

    def get_llms_txt_index(self, url: str) -> str:
        """Reads an llms.txt file and returns the index of all available documentation pages.

        An llms.txt file is a standardized index of documentation for a project.
        This function reads the index and returns all available pages with their titles,
        URLs, descriptions, and sections. Use this to discover what documentation is
        available, then use read_llms_txt_url to fetch specific pages.

        :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt).
        :return: JSON with the overview and list of available documentation pages.
        """
        reader = self._build_reader()

        log_info(f"Reading llms.txt index from {url}")
        llms_txt_content = reader._fetch_url(url)
        if not llms_txt_content:
            return f"Failed to fetch llms.txt from {url}"

        overview, entries = reader._parse_llms_txt(llms_txt_content, url)

        index = {
            "overview": overview,
            "pages": [
                {
                    "title": entry.title,
                    "url": entry.url,
                    "description": entry.description,
                    "section": entry.section,
                }
                for entry in entries
            ],
            "total_pages": len(entries),
        }
        return json.dumps(index)

    def read_llms_txt_url(self, url: str) -> str:
        """Fetches and returns the content of a specific documentation page URL.

        Use this after calling get_llms_txt_index to fetch the content of specific pages
        you want to read. You can call this multiple times for different URLs.

        :param url: The URL of the documentation page to read.
        :return: The text content of the page.
        """
        reader = self._build_reader()

        log_debug(f"Fetching URL: {url}")
        content = reader._fetch_url(url)
        if not content:
            return f"Failed to fetch content from {url}"

        return content

    def read_llms_txt_and_load_knowledge(self, url: str) -> str:
        """Reads an llms.txt file, fetches all linked documentation pages, and loads them into the knowledge base.

        An llms.txt file is a standardized index of documentation for a project.
        This function reads the index, fetches every linked page, and stores the content
        in the knowledge base for future retrieval.

        :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt).
        :return: Summary of what was loaded into the knowledge base.
        """
        if self.knowledge is None:
            return "Knowledge base not provided"

        reader = self._build_reader(limit_urls=True)

        log_info(f"Reading llms.txt from {url}")
        documents: List[Document] = reader.read(url=url)

        if not documents:
            return f"No documents found in llms.txt at {url}"

        log_debug(f"Loading {len(documents)} documents into knowledge base")
        for doc in documents:
            self.knowledge.insert(
                text_content=doc.content,
                name=doc.name,
                metadata=doc.meta_data,
            )

        return f"Successfully loaded {len(documents)} documents from llms.txt into the knowledge base"
+ +Acme makes it easy to build production-ready AI agents. + +## Getting Started + +- [Introduction](https://docs.acme.com/introduction): Overview of Acme +- [Installation](https://docs.acme.com/installation): How to install Acme +- [Quickstart](https://docs.acme.com/quickstart): Build your first agent + +## API Reference + +- [Agent API](https://docs.acme.com/api/agent): Agent class reference +- [Tools API](https://docs.acme.com/api/tools): Tools class reference + +## Optional + +- [Changelog](https://docs.acme.com/changelog): Release notes +- [Contributing](https://docs.acme.com/contributing): How to contribute +""" + +SAMPLE_LLMS_TXT_RELATIVE = """# My Project + +> A project with relative links. + +## Docs + +- [Guide](/docs/guide): The guide +- [API](api/reference): API docs +""" + + +# --------------------------------------------------------------------------- +# LLMsTxtReader tests +# --------------------------------------------------------------------------- + + +class TestLLMsTxtReaderInit: + def test_defaults(self): + reader = LLMsTxtReader() + assert reader.max_urls == 100 + assert reader.timeout == 30 + assert reader.proxy is None + assert reader.include_llms_txt_content is True + assert reader.skip_optional is False + + def test_custom_params(self): + reader = LLMsTxtReader(max_urls=50, timeout=10, skip_optional=True) + assert reader.max_urls == 50 + assert reader.timeout == 10 + assert reader.skip_optional is True + + +class TestParseLLMsTxt: + def test_parses_entries(self): + reader = LLMsTxtReader() + overview, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert len(entries) == 7 + assert entries[0].title == "Introduction" + assert entries[0].url == "https://docs.acme.com/introduction" + assert entries[0].description == "Overview of Acme" + assert entries[0].section == "Getting Started" + + def test_parses_overview(self): + reader = LLMsTxtReader() + overview, entries = 
reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert "# Acme Project" in overview + assert "Acme makes it easy" in overview + + def test_sections_assigned(self): + reader = LLMsTxtReader() + _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + sections = {e.section for e in entries} + assert sections == {"Getting Started", "API Reference", "Optional"} + + def test_skip_optional(self): + reader = LLMsTxtReader(skip_optional=True) + _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert len(entries) == 5 + assert all(e.section != "Optional" for e in entries) + + def test_relative_urls_resolved(self): + reader = LLMsTxtReader() + _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") + + assert entries[0].url == "https://example.com/docs/guide" + assert entries[1].url == "https://example.com/api/reference" + + def test_empty_content(self): + reader = LLMsTxtReader() + overview, entries = reader._parse_llms_txt("", "https://example.com/llms.txt") + + assert overview == "" + assert entries == [] + + def test_no_links(self): + content = "# Title\n\nSome overview text.\n\n## Section\n\nNo links here." + reader = LLMsTxtReader() + overview, entries = reader._parse_llms_txt(content, "https://example.com/llms.txt") + + assert "# Title" in overview + assert entries == [] + + +class TestExtractContent: + def test_extracts_from_main_tag(self): + reader = LLMsTxtReader() + html = "
Main content here
Foot
" + result = reader._extract_content(html) + assert "Main content here" in result + assert "Nav" not in result + + def test_extracts_from_body_fallback(self): + reader = LLMsTxtReader() + html = "
Body content
" + result = reader._extract_content(html) + assert "Body content" in result + + def test_strips_script_and_style(self): + reader = LLMsTxtReader() + html = "

Text

" + result = reader._extract_content(html) + assert "var x" not in result + assert "Text" in result + + +class TestFetchUrl: + def test_returns_text_for_plain_content(self): + reader = LLMsTxtReader() + mock_response = MagicMock() + mock_response.headers = {"content-type": "text/plain"} + mock_response.text = "Plain text content" + mock_response.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_response): + result = reader._fetch_url("https://example.com/file.txt") + + assert result == "Plain text content" + + def test_extracts_html_content(self): + reader = LLMsTxtReader() + mock_response = MagicMock() + mock_response.headers = {"content-type": "text/html"} + mock_response.text = "
Extracted
" + mock_response.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_response): + result = reader._fetch_url("https://example.com/page") + + assert "Extracted" in result + + def test_returns_none_on_http_error(self): + reader = LLMsTxtReader() + + with patch( + "httpx.get", + side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock(status_code=404)), + ): + result = reader._fetch_url("https://example.com/missing") + + assert result is None + + def test_returns_none_on_request_error(self): + reader = LLMsTxtReader() + + with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): + result = reader._fetch_url("https://example.com/down") + + assert result is None + + +class TestBuildDocuments: + def test_builds_overview_and_linked_docs(self): + reader = LLMsTxtReader(chunk=False) + entries = [ + LLMsTxtEntry(title="Intro", url="https://example.com/intro", description="Intro page", section="Docs"), + ] + fetched = {"https://example.com/intro": "Introduction content here"} + + docs = reader._build_documents("Overview text", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 2 + assert docs[0].meta_data["type"] == "llms_txt_overview" + assert docs[0].content == "Overview text" + assert docs[1].meta_data["type"] == "llms_txt_linked_doc" + assert docs[1].name == "Intro" + assert docs[1].content == "Introduction content here" + + def test_skips_unfetched_entries(self): + reader = LLMsTxtReader(chunk=False) + entries = [ + LLMsTxtEntry(title="Missing", url="https://example.com/missing", description="", section="Docs"), + ] + fetched = {} + + docs = reader._build_documents("Overview", entries, fetched, "https://example.com/llms.txt", None) + + # Only the overview doc + assert len(docs) == 1 + + def test_excludes_overview_when_disabled(self): + reader = LLMsTxtReader(chunk=False, include_llms_txt_content=False) + entries = [ + LLMsTxtEntry(title="Page", 
url="https://example.com/page", description="", section="Docs"), + ] + fetched = {"https://example.com/page": "Page content"} + + docs = reader._build_documents("Overview", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 1 + assert docs[0].meta_data["type"] == "llms_txt_linked_doc" + + +class TestRead: + def test_read_fetches_and_builds_docs(self): + reader = LLMsTxtReader(max_urls=5, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "_fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + # 1 overview + 5 linked docs (max_urls=5) + assert len(docs) == 6 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + def test_read_returns_empty_on_fetch_failure(self): + reader = LLMsTxtReader() + + with patch.object(reader, "_fetch_url", return_value=None): + docs = reader.read("https://example.com/llms.txt") + + assert docs == [] + + def test_max_urls_limits_fetched_pages(self): + reader = LLMsTxtReader(max_urls=2, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "_fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + # 1 overview + 2 linked docs (max_urls=2) + assert len(docs) == 3 + + +# --------------------------------------------------------------------------- +# LLMsTxtTools tests +# --------------------------------------------------------------------------- + + +class TestLLMsTxtToolsInit: + def test_without_knowledge_registers_agentic_tools(self): + tools = LLMsTxtTools() + func_names = [func.name for func in tools.functions.values()] + assert "get_llms_txt_index" in func_names + assert "read_llms_txt_url" in func_names + assert "read_llms_txt_and_load_knowledge" not in func_names + + def 
test_with_knowledge_registers_load(self): + mock_knowledge = MagicMock() + tools = LLMsTxtTools(knowledge=mock_knowledge) + func_names = [func.name for func in tools.functions.values()] + assert "read_llms_txt_and_load_knowledge" in func_names + assert "get_llms_txt_index" not in func_names + + def test_custom_params(self): + tools = LLMsTxtTools(max_urls=50, timeout=10, skip_optional=True) + assert tools.max_urls == 50 + assert tools.timeout == 10 + assert tools.skip_optional is True + + +class TestGetLLMsTxtIndex: + def test_returns_index_json(self): + tools = LLMsTxtTools() + + mock_response = MagicMock() + mock_response.headers = {"content-type": "text/plain"} + mock_response.text = SAMPLE_LLMS_TXT + mock_response.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_response): + result = tools.get_llms_txt_index("https://docs.acme.com/llms.txt") + + data = json.loads(result) + assert data["total_pages"] == 7 + assert data["pages"][0]["title"] == "Introduction" + assert data["pages"][0]["url"] == "https://docs.acme.com/introduction" + assert "overview" in data + + def test_returns_error_on_fetch_failure(self): + tools = LLMsTxtTools() + + with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.get_llms_txt_index("https://example.com/llms.txt") + + assert "Failed to fetch" in result + + +class TestReadLLMsTxtUrl: + def test_returns_page_content(self): + tools = LLMsTxtTools() + + mock_response = MagicMock() + mock_response.headers = {"content-type": "text/plain"} + mock_response.text = "Page content here" + mock_response.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_response): + result = tools.read_llms_txt_url("https://docs.acme.com/introduction") + + assert result == "Page content here" + + def test_returns_error_on_fetch_failure(self): + tools = LLMsTxtTools() + + with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): + result = 
tools.read_llms_txt_url("https://example.com/missing") + + assert "Failed to fetch" in result + + +class TestLoadKnowledge: + def test_inserts_into_knowledge(self): + mock_knowledge = MagicMock() + tools = LLMsTxtTools(knowledge=mock_knowledge) + + mock_response = MagicMock() + mock_response.headers = {"content-type": "text/plain"} + mock_response.text = "Page content" + mock_response.raise_for_status = MagicMock() + + # Simple llms.txt with one link + llms_content = "# Test\n\n## Docs\n\n- [Page](https://example.com/page): A page\n" + call_count = 0 + + def mock_get(url, **kwargs): + nonlocal call_count + call_count += 1 + resp = MagicMock() + resp.headers = {"content-type": "text/plain"} + resp.raise_for_status = MagicMock() + if call_count == 1: + resp.text = llms_content + else: + resp.text = "Page content" + return resp + + with patch("httpx.get", side_effect=mock_get): + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + + assert mock_knowledge.insert.called + assert "Successfully loaded" in result + + def test_returns_message_when_no_knowledge(self): + tools = LLMsTxtTools() + # Force-call the knowledge method even though it wouldn't be registered + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + assert result == "Knowledge base not provided" + + def test_returns_message_when_no_docs(self): + mock_knowledge = MagicMock() + tools = LLMsTxtTools(knowledge=mock_knowledge) + + with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + + assert "No documents found" in result From f2a91010081128c6f7b36d0a87cd368656d13c00 Mon Sep 17 00:00:00 2001 From: Yash Pratap Solanky <101447028+ysolanky@users.noreply.github.com> Date: Fri, 10 Apr 2026 10:48:36 -0400 Subject: [PATCH 02/23] fix: address review issues in LLMsTxtReader and LLMsTxtTools (#7465) ## Summary Addresses code review feedback on #7458. 
Fixes several issues in the LLMsTxtReader and LLMsTxtTools implementation. **Changes:** - **Lazy BeautifulSoup import** - Deferred to `_extract_content()` instead of hard-failing at module import time - **Variable shadowing fix** - Renamed `url` to `entry_url` in `async_read()` dict comprehension to avoid shadowing the method parameter - **Concurrency limiting** - Added `asyncio.Semaphore(10)` to prevent overwhelming target servers when fetching 100+ URLs concurrently - **Better text extraction** - Changed `_extract_content()` separator from `" "` to `"\n"` to preserve document structure - **Public API methods** - Renamed `_fetch_url` / `_parse_llms_txt` to `fetch_url` / `parse_llms_txt` since they are called by the toolkit - **Reader reuse** - LLMsTxtTools now creates a single `LLMsTxtReader` instance in `__init__` instead of per tool call - **Async tool variants** - Added `aget_llms_txt_index`, `aread_llms_txt_url`, `aread_llms_txt_and_load_knowledge` registered via `async_tools` following the codebase convention (e.g. BrandfetchTools) - **New tests** - Added tests for async tool registration, reader reuse, and newline preservation in HTML extraction ## Type of change - [x] Improvement --- ## Checklist - [x] Code complies with style guidelines - [x] Ran format/validation scripts (`./scripts/format.sh` and `./scripts/validate.sh`) - [x] Self-review completed - [x] Documentation updated (comments, docstrings) - [x] Tests added/updated (if applicable) ### Duplicate and AI-Generated PR Check - [x] I have searched existing [open pull requests](../../pulls) and confirmed that no other PR already addresses this issue - [x] Check if this PR was entirely AI-generated (by Copilot, Claude Code, Cursor, etc.) --- ## Additional Notes All 36 tests pass (up from 32 - added 4 new tests for async registration, reader reuse, and HTML newline preservation). 
--- .../agno/knowledge/reader/llms_txt_reader.py | 48 ++++---- libs/agno/agno/tools/llms_txt.py | 104 ++++++++++++++---- libs/agno/tests/unit/tools/test_llms_txt.py | 54 ++++++--- 3 files changed, 152 insertions(+), 54 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 66e2256336..5952e3679a 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -14,17 +14,15 @@ from agno.knowledge.types import ContentType from agno.utils.log import log_debug, log_error, log_warning -try: - from bs4 import BeautifulSoup -except ImportError: - raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") - - # Pattern to match markdown links: - [Title](url) or - [Title](url): description +# Note: titles with nested brackets (e.g. [Agent [Beta]](url)) are not supported. _LINK_PATTERN = re.compile(r"-\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.+))?") # Pattern to match H2 section headers _SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE) +# Maximum number of concurrent HTTP requests when fetching linked pages +_MAX_CONCURRENT_FETCHES = 10 + @dataclass class LLMsTxtEntry: @@ -96,7 +94,7 @@ def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]: def get_supported_content_types(cls) -> List[ContentType]: return [ContentType.URL] - def _parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]: + def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]: """Parse an llms.txt file and extract all linked URLs. 
Args: @@ -155,6 +153,11 @@ def _parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTx def _extract_content(self, html: str) -> str: """Extract readable text content from HTML.""" + try: + from bs4 import BeautifulSoup + except ImportError: + raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") + soup = BeautifulSoup(html, "html.parser") # Remove unwanted elements @@ -164,15 +167,15 @@ def _extract_content(self, html: str) -> str: # Try to find main content main = soup.find("main") or soup.find("article") or soup.find(attrs={"role": "main"}) if main: - return main.get_text(strip=True, separator=" ") + return main.get_text(separator="\n", strip=True) body = soup.find("body") if body: - return body.get_text(strip=True, separator=" ") + return body.get_text(separator="\n", strip=True) - return soup.get_text(strip=True, separator=" ") + return soup.get_text(separator="\n", strip=True) - def _fetch_url(self, url: str) -> Optional[str]: + def fetch_url(self, url: str) -> Optional[str]: """Fetch content from a URL, returning text for text-like content or extracted text from HTML.""" try: log_debug(f"Fetching: {url}") @@ -205,7 +208,7 @@ def _fetch_url(self, url: str) -> Optional[str]: log_error(f"Failed to fetch {url}: {str(e)}") return None - async def _async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: + async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: """Asynchronously fetch content from a URL.""" try: log_debug(f"Fetching asynchronously: {url}") @@ -296,13 +299,13 @@ def read(self, url: str, name: Optional[str] = None) -> List[Document]: log_debug(f"Reading llms.txt: {url}") # Fetch the llms.txt file - llms_txt_content = self._fetch_url(url) + llms_txt_content = self.fetch_url(url) if not llms_txt_content: log_error(f"Failed to fetch llms.txt from {url}") return [] # Parse the llms.txt content - overview, entries = 
self._parse_llms_txt(llms_txt_content, url) + overview, entries = self.parse_llms_txt(llms_txt_content, url) log_debug(f"Found {len(entries)} linked URLs in llms.txt") # Limit the number of URLs to fetch @@ -313,7 +316,7 @@ def read(self, url: str, name: Optional[str] = None) -> List[Document]: # Fetch all linked pages fetched: Dict[str, str] = {} for entry in entries_to_fetch: - content = self._fetch_url(entry.url) + content = self.fetch_url(entry.url) if content: fetched[entry.url] = content @@ -335,13 +338,13 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen client_args = {"proxy": self.proxy} if self.proxy else {} async with httpx.AsyncClient(**client_args) as client: # type: ignore # Fetch the llms.txt file - llms_txt_content = await self._async_fetch_url(client, url) + llms_txt_content = await self.async_fetch_url(client, url) if not llms_txt_content: log_error(f"Failed to fetch llms.txt from {url}") return [] # Parse the llms.txt content - overview, entries = self._parse_llms_txt(llms_txt_content, url) + overview, entries = self.parse_llms_txt(llms_txt_content, url) log_debug(f"Found {len(entries)} linked URLs in llms.txt") # Limit the number of URLs to fetch @@ -349,13 +352,16 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - # Fetch all linked pages concurrently + # Fetch all linked pages concurrently with a semaphore to limit parallelism + semaphore = asyncio.Semaphore(_MAX_CONCURRENT_FETCHES) + async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: - content = await self._async_fetch_url(client, entry.url) - return entry.url, content + async with semaphore: + content = await self.async_fetch_url(client, entry.url) + return entry.url, content results = await asyncio.gather(*[_fetch_entry(e) for e in entries_to_fetch]) - fetched: Dict[str, str] = {url: content for 
url, content in results if content} + fetched: Dict[str, str] = {entry_url: content for entry_url, content in results if content} log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages") return self._build_documents(overview, entries_to_fetch, fetched, url, name) diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 88945cec75..a7a050b32d 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -49,19 +49,30 @@ def __init__( skip_optional: bool = False, **kwargs, ): + from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader + self.knowledge: Optional[Knowledge] = knowledge self.max_urls = max_urls self.timeout = timeout self.skip_optional = skip_optional + self.reader = LLMsTxtReader( + max_urls=max_urls, + timeout=timeout, + skip_optional=skip_optional, + ) tools: List[Any] = [] + async_tools_list: List[tuple] = [] if self.knowledge is not None: tools.append(self.read_llms_txt_and_load_knowledge) + async_tools_list.append((self.aread_llms_txt_and_load_knowledge, "read_llms_txt_and_load_knowledge")) else: tools.append(self.get_llms_txt_index) tools.append(self.read_llms_txt_url) + async_tools_list.append((self.aget_llms_txt_index, "get_llms_txt_index")) + async_tools_list.append((self.aread_llms_txt_url, "read_llms_txt_url")) - super().__init__(name="llms_txt_tools", tools=tools, **kwargs) + super().__init__(name="llms_txt_tools", tools=tools, async_tools=async_tools_list, **kwargs) def get_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. @@ -74,19 +85,44 @@ def get_llms_txt_index(self, url: str) -> str: :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: JSON with the overview and list of available documentation pages. 
""" - from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader + log_info(f"Reading llms.txt index from {url}") + llms_txt_content = self.reader.fetch_url(url) + if not llms_txt_content: + return f"Failed to fetch llms.txt from {url}" - reader = LLMsTxtReader( - timeout=self.timeout, - skip_optional=self.skip_optional, - ) + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) + + index = { + "overview": overview, + "pages": [ + { + "title": entry.title, + "url": entry.url, + "description": entry.description, + "section": entry.section, + } + for entry in entries + ], + "total_pages": len(entries), + } + return json.dumps(index) + + async def aget_llms_txt_index(self, url: str) -> str: + """Async variant of get_llms_txt_index. + + :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). + :return: JSON with the overview and list of available documentation pages. + """ + import httpx log_info(f"Reading llms.txt index from {url}") - llms_txt_content = reader._fetch_url(url) + async with httpx.AsyncClient() as client: + llms_txt_content = await self.reader.async_fetch_url(client, url) + if not llms_txt_content: return f"Failed to fetch llms.txt from {url}" - overview, entries = reader._parse_llms_txt(llms_txt_content, url) + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) index = { "overview": overview, @@ -112,12 +148,25 @@ def read_llms_txt_url(self, url: str) -> str: :param url: The URL of the documentation page to read. :return: The text content of the page. """ - from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader + log_debug(f"Fetching URL: {url}") + content = self.reader.fetch_url(url) + if not content: + return f"Failed to fetch content from {url}" + + return content + + async def aread_llms_txt_url(self, url: str) -> str: + """Async variant of read_llms_txt_url. - reader = LLMsTxtReader(timeout=self.timeout) + :param url: The URL of the documentation page to read. 
+ :return: The text content of the page. + """ + import httpx log_debug(f"Fetching URL: {url}") - content = reader._fetch_url(url) + async with httpx.AsyncClient() as client: + content = await self.reader.async_fetch_url(client, url) + if not content: return f"Failed to fetch content from {url}" @@ -136,23 +185,40 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: if self.knowledge is None: return "Knowledge base not provided" - from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader + log_info(f"Reading llms.txt from {url}") + documents: List[Document] = self.reader.read(url=url) - reader = LLMsTxtReader( - max_urls=self.max_urls, - timeout=self.timeout, - skip_optional=self.skip_optional, - ) + if not documents: + return f"No documents found in llms.txt at {url}" + + log_debug(f"Loading {len(documents)} documents into knowledge base") + for doc in documents: + self.knowledge.insert( + text_content=doc.content, + name=doc.name, + metadata=doc.meta_data, + ) + + return f"Successfully loaded {len(documents)} documents from llms.txt into the knowledge base" + + async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: + """Async variant of read_llms_txt_and_load_knowledge. + + :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). + :return: Summary of what was loaded into the knowledge base. 
+ """ + if self.knowledge is None: + return "Knowledge base not provided" log_info(f"Reading llms.txt from {url}") - documents: List[Document] = reader.read(url=url) + documents: List[Document] = await self.reader.async_read(url=url) if not documents: return f"No documents found in llms.txt at {url}" log_debug(f"Loading {len(documents)} documents into knowledge base") for doc in documents: - self.knowledge.insert( + await self.knowledge.ainsert( text_content=doc.content, name=doc.name, metadata=doc.meta_data, diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index ec20971273..b3cce25d04 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -73,7 +73,7 @@ def test_custom_params(self): class TestParseLLMsTxt: def test_parses_entries(self): reader = LLMsTxtReader() - overview, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") assert len(entries) == 7 assert entries[0].title == "Introduction" @@ -83,35 +83,35 @@ def test_parses_entries(self): def test_parses_overview(self): reader = LLMsTxtReader() - overview, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") assert "# Acme Project" in overview assert "Acme makes it easy" in overview def test_sections_assigned(self): reader = LLMsTxtReader() - _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") sections = {e.section for e in entries} assert sections == {"Getting Started", "API Reference", "Optional"} def test_skip_optional(self): reader = LLMsTxtReader(skip_optional=True) - _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT, 
"https://docs.acme.com/llms.txt") + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") assert len(entries) == 5 assert all(e.section != "Optional" for e in entries) def test_relative_urls_resolved(self): reader = LLMsTxtReader() - _, entries = reader._parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") assert entries[0].url == "https://example.com/docs/guide" assert entries[1].url == "https://example.com/api/reference" def test_empty_content(self): reader = LLMsTxtReader() - overview, entries = reader._parse_llms_txt("", "https://example.com/llms.txt") + overview, entries = reader.parse_llms_txt("", "https://example.com/llms.txt") assert overview == "" assert entries == [] @@ -119,7 +119,7 @@ def test_empty_content(self): def test_no_links(self): content = "# Title\n\nSome overview text.\n\n## Section\n\nNo links here." reader = LLMsTxtReader() - overview, entries = reader._parse_llms_txt(content, "https://example.com/llms.txt") + overview, entries = reader.parse_llms_txt(content, "https://example.com/llms.txt") assert "# Title" in overview assert entries == [] @@ -146,6 +146,14 @@ def test_strips_script_and_style(self): assert "var x" not in result assert "Text" in result + def test_preserves_structure_with_newlines(self): + reader = LLMsTxtReader() + html = "

First paragraph

Second paragraph

" + result = reader._extract_content(html) + assert "First paragraph" in result + assert "Second paragraph" in result + assert "\n" in result + class TestFetchUrl: def test_returns_text_for_plain_content(self): @@ -156,7 +164,7 @@ def test_returns_text_for_plain_content(self): mock_response.raise_for_status = MagicMock() with patch("httpx.get", return_value=mock_response): - result = reader._fetch_url("https://example.com/file.txt") + result = reader.fetch_url("https://example.com/file.txt") assert result == "Plain text content" @@ -168,7 +176,7 @@ def test_extracts_html_content(self): mock_response.raise_for_status = MagicMock() with patch("httpx.get", return_value=mock_response): - result = reader._fetch_url("https://example.com/page") + result = reader.fetch_url("https://example.com/page") assert "Extracted" in result @@ -179,7 +187,7 @@ def test_returns_none_on_http_error(self): "httpx.get", side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock(status_code=404)), ): - result = reader._fetch_url("https://example.com/missing") + result = reader.fetch_url("https://example.com/missing") assert result is None @@ -187,7 +195,7 @@ def test_returns_none_on_request_error(self): reader = LLMsTxtReader() with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): - result = reader._fetch_url("https://example.com/down") + result = reader.fetch_url("https://example.com/down") assert result is None @@ -243,7 +251,7 @@ def mock_fetch(url): return SAMPLE_LLMS_TXT return f"Content of {url}" - with patch.object(reader, "_fetch_url", side_effect=mock_fetch): + with patch.object(reader, "fetch_url", side_effect=mock_fetch): docs = reader.read("https://example.com/llms.txt") # 1 overview + 5 linked docs (max_urls=5) @@ -253,7 +261,7 @@ def mock_fetch(url): def test_read_returns_empty_on_fetch_failure(self): reader = LLMsTxtReader() - with patch.object(reader, "_fetch_url", return_value=None): + with patch.object(reader, "fetch_url", 
return_value=None): docs = reader.read("https://example.com/llms.txt") assert docs == [] @@ -266,7 +274,7 @@ def mock_fetch(url): return SAMPLE_LLMS_TXT return f"Content of {url}" - with patch.object(reader, "_fetch_url", side_effect=mock_fetch): + with patch.object(reader, "fetch_url", side_effect=mock_fetch): docs = reader.read("https://example.com/llms.txt") # 1 overview + 2 linked docs (max_urls=2) @@ -286,6 +294,12 @@ def test_without_knowledge_registers_agentic_tools(self): assert "read_llms_txt_url" in func_names assert "read_llms_txt_and_load_knowledge" not in func_names + def test_without_knowledge_registers_async_tools(self): + tools = LLMsTxtTools() + async_func_names = [func.name for func in tools.async_functions.values()] + assert "get_llms_txt_index" in async_func_names + assert "read_llms_txt_url" in async_func_names + def test_with_knowledge_registers_load(self): mock_knowledge = MagicMock() tools = LLMsTxtTools(knowledge=mock_knowledge) @@ -293,12 +307,24 @@ def test_with_knowledge_registers_load(self): assert "read_llms_txt_and_load_knowledge" in func_names assert "get_llms_txt_index" not in func_names + def test_with_knowledge_registers_async_load(self): + mock_knowledge = MagicMock() + tools = LLMsTxtTools(knowledge=mock_knowledge) + async_func_names = [func.name for func in tools.async_functions.values()] + assert "read_llms_txt_and_load_knowledge" in async_func_names + def test_custom_params(self): tools = LLMsTxtTools(max_urls=50, timeout=10, skip_optional=True) assert tools.max_urls == 50 assert tools.timeout == 10 assert tools.skip_optional is True + def test_reader_is_reused(self): + tools = LLMsTxtTools() + assert tools.reader is not None + assert tools.reader.timeout == tools.timeout + assert tools.reader.max_urls == tools.max_urls + class TestGetLLMsTxtIndex: def test_returns_index_json(self): From 482cd73bb9bb484227e9a57c19d216f30a9f5fe7 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:05:29 -0400 Subject: 
[PATCH 03/23] fix: improve LLMsTxtTools async patterns and deduplicate reader logic - Full async docstrings on all 3 async tool methods so the LLM sees proper tool descriptions in async mode - AsyncClient now receives timeout and proxy via _async_client_kwargs() - Module-level httpx import consistent with Brandfetch/Perplexity - Extract _process_response() to deduplicate content-type classification across fetch_url and async_fetch_url --- .../agno/knowledge/reader/llms_txt_reader.py | 37 ++++++------------- libs/agno/agno/tools/llms_txt.py | 37 ++++++++++++++----- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 5952e3679a..138c69f815 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -175,6 +175,16 @@ def _extract_content(self, html: str) -> str: return soup.get_text(separator="\n", strip=True) + def _process_response(self, content_type: str, text: str) -> str: + """Classify an HTTP response by content-type and return processed text.""" + if any(t in content_type for t in ["text/plain", "text/markdown"]): + return text + + if "text/html" in content_type or text.strip().startswith((" Optional[str]: """Fetch content from a URL, returning text for text-like content or extracted text from HTML.""" try: @@ -184,20 +194,7 @@ def fetch_url(self, url: str) -> Optional[str]: else: response = httpx.get(url, timeout=self.timeout, follow_redirects=True) response.raise_for_status() - - content_type = response.headers.get("content-type", "") - text = response.text - - # If content is plain text or markdown, return as-is - if any(t in content_type for t in ["text/plain", "text/markdown"]): - return text - - # If content is HTML, extract the text - if "text/html" in content_type or text.strip().startswith((" Optional log_debug(f"Fetching asynchronously: {url}") response = await 
client.get(url, timeout=self.timeout, follow_redirects=True) response.raise_for_status() - - content_type = response.headers.get("content-type", "") - text = response.text - - if any(t in content_type for t in ["text/plain", "text/markdown"]): - return text - - if "text/html" in content_type or text.strip().startswith((" Dict[str, Any]: + """Build kwargs for httpx.AsyncClient matching the reader's config.""" + kwargs: Dict[str, Any] = {"timeout": httpx.Timeout(self.timeout)} + if self.reader.proxy: + kwargs["proxy"] = self.reader.proxy + return kwargs + def get_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. @@ -108,15 +117,18 @@ def get_llms_txt_index(self, url: str) -> str: return json.dumps(index) async def aget_llms_txt_index(self, url: str) -> str: - """Async variant of get_llms_txt_index. + """Reads an llms.txt file and returns the index of all available documentation pages. + + An llms.txt file is a standardized index of documentation for a project. + This function reads the index and returns all available pages with their titles, + URLs, descriptions, and sections. Use this to discover what documentation is + available, then use read_llms_txt_url to fetch specific pages. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: JSON with the overview and list of available documentation pages. """ - import httpx - log_info(f"Reading llms.txt index from {url}") - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(**self._async_client_kwargs()) as client: llms_txt_content = await self.reader.async_fetch_url(client, url) if not llms_txt_content: @@ -156,15 +168,16 @@ def read_llms_txt_url(self, url: str) -> str: return content async def aread_llms_txt_url(self, url: str) -> str: - """Async variant of read_llms_txt_url. + """Fetches and returns the content of a specific documentation page URL. 
+ + Use this after calling get_llms_txt_index to fetch the content of specific pages + you want to read. You can call this multiple times for different URLs. :param url: The URL of the documentation page to read. :return: The text content of the page. """ - import httpx - log_debug(f"Fetching URL: {url}") - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(**self._async_client_kwargs()) as client: content = await self.reader.async_fetch_url(client, url) if not content: @@ -202,7 +215,11 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: return f"Successfully loaded {len(documents)} documents from llms.txt into the knowledge base" async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: - """Async variant of read_llms_txt_and_load_knowledge. + """Reads an llms.txt file, fetches all linked documentation pages, and loads them into the knowledge base. + + An llms.txt file is a standardized index of documentation for a project. + This function reads the index, fetches every linked page, and stores the content + in the knowledge base for future retrieval. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: Summary of what was loaded into the knowledge base. From f35ab19e3052d566fef512a774f6bfc1e8e3b14c Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:10:19 -0400 Subject: [PATCH 04/23] fix: delegate knowledge loading to Knowledge.insert() pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of manually reading documents and looping insert(), delegate to self.knowledge.insert(url=url, reader=self.reader) which gives us content hashing, deduplication, status tracking, and proper vector DB insertion — matching the pattern used by WebsiteTools and WikipediaTools. 
--- libs/agno/agno/tools/llms_txt.py | 33 +++--------------- libs/agno/tests/unit/tools/test_llms_txt.py | 37 ++++----------------- 2 files changed, 10 insertions(+), 60 deletions(-) diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 0b42728f8b..8fa8375d2c 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -3,7 +3,6 @@ import httpx -from agno.knowledge.document import Document from agno.knowledge.knowledge import Knowledge from agno.tools import Toolkit from agno.utils.log import log_debug, log_info @@ -199,20 +198,8 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: return "Knowledge base not provided" log_info(f"Reading llms.txt from {url}") - documents: List[Document] = self.reader.read(url=url) - - if not documents: - return f"No documents found in llms.txt at {url}" - - log_debug(f"Loading {len(documents)} documents into knowledge base") - for doc in documents: - self.knowledge.insert( - text_content=doc.content, - name=doc.name, - metadata=doc.meta_data, - ) - - return f"Successfully loaded {len(documents)} documents from llms.txt into the knowledge base" + self.knowledge.insert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: """Reads an llms.txt file, fetches all linked documentation pages, and loads them into the knowledge base. 
@@ -228,17 +215,5 @@ async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: return "Knowledge base not provided" log_info(f"Reading llms.txt from {url}") - documents: List[Document] = await self.reader.async_read(url=url) - - if not documents: - return f"No documents found in llms.txt at {url}" - - log_debug(f"Loading {len(documents)} documents into knowledge base") - for doc in documents: - await self.knowledge.ainsert( - text_content=doc.content, - name=doc.name, - metadata=doc.meta_data, - ) - - return f"Successfully loaded {len(documents)} documents from llms.txt into the knowledge base" + await self.knowledge.ainsert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index b3cce25d04..1740cd5488 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -377,48 +377,23 @@ def test_returns_error_on_fetch_failure(self): class TestLoadKnowledge: - def test_inserts_into_knowledge(self): + def test_delegates_to_knowledge_insert(self): mock_knowledge = MagicMock() tools = LLMsTxtTools(knowledge=mock_knowledge) - mock_response = MagicMock() - mock_response.headers = {"content-type": "text/plain"} - mock_response.text = "Page content" - mock_response.raise_for_status = MagicMock() + tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - # Simple llms.txt with one link - llms_content = "# Test\n\n## Docs\n\n- [Page](https://example.com/page): A page\n" - call_count = 0 - - def mock_get(url, **kwargs): - nonlocal call_count - call_count += 1 - resp = MagicMock() - resp.headers = {"content-type": "text/plain"} - resp.raise_for_status = MagicMock() - if call_count == 1: - resp.text = llms_content - else: - resp.text = "Page content" - return resp - - with patch("httpx.get", side_effect=mock_get): - result = 
tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - - assert mock_knowledge.insert.called - assert "Successfully loaded" in result + mock_knowledge.insert.assert_called_once_with(url="https://example.com/llms.txt", reader=tools.reader) def test_returns_message_when_no_knowledge(self): tools = LLMsTxtTools() - # Force-call the knowledge method even though it wouldn't be registered result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") assert result == "Knowledge base not provided" - def test_returns_message_when_no_docs(self): + def test_returns_success_message(self): mock_knowledge = MagicMock() tools = LLMsTxtTools(knowledge=mock_knowledge) - with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): - result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - assert "No documents found" in result + assert "Successfully loaded" in result From 8bb61d77a13a5a55fa4e10b6b3c746235d9c58ee Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:30:15 -0400 Subject: [PATCH 05/23] fix: simplify reader, delegate to Knowledge pipeline, remove dead code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reader: - Remove redundant state: in_optional and past_first_section replaced by single current_section variable - Remove dead if/else branch on proxy — httpx accepts proxy=None - Remove WHAT comments that restate the next line - Simplify AsyncClient construction (proxy=self.proxy directly) Toolkit: - Extract _format_index helper to deduplicate sync/async index building - Delegate knowledge loading to Knowledge.insert(url=, reader=) pipeline Knowledge: - Skip pre-download when custom reader is provided — URL-based readers like LLMsTxtReader need the URL string, not pre-fetched BytesIO --- libs/agno/agno/knowledge/knowledge.py | 8 ++- 
.../agno/knowledge/reader/llms_txt_reader.py | 33 ++---------- libs/agno/agno/tools/llms_txt.py | 50 ++++++++----------- libs/agno/tests/unit/tools/test_llms_txt.py | 11 +--- 4 files changed, 33 insertions(+), 69 deletions(-) diff --git a/libs/agno/agno/knowledge/knowledge.py b/libs/agno/agno/knowledge/knowledge.py index 200018d1b4..dd01927627 100644 --- a/libs/agno/agno/knowledge/knowledge.py +++ b/libs/agno/agno/knowledge/knowledge.py @@ -1564,7 +1564,9 @@ async def _aload_from_url( file_extension = url_path.suffix.lower() bytes_content = None - if file_extension: + # Skip pre-download when a custom reader is provided — it knows how to + # handle the URL directly (e.g. LLMsTxtReader fetches linked pages) + if file_extension and not content.reader: async with AsyncClient() as client: response = await async_fetch_with_retry(content.url, client=client) bytes_content = BytesIO(response.content) @@ -1716,7 +1718,9 @@ def _load_from_url( file_extension = url_path.suffix.lower() bytes_content = None - if file_extension: + # Skip pre-download when a custom reader is provided — it knows how to + # handle the URL directly (e.g. 
LLMsTxtReader fetches linked pages) + if file_extension and not content.reader: response = fetch_with_retry(content.url) bytes_content = BytesIO(response.content) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 138c69f815..839441b8e1 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -106,29 +106,21 @@ def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxt """ entries: List[LLMsTxtEntry] = [] current_section = "" - in_optional = False - - lines = content.split("\n") overview_lines: List[str] = [] - past_first_section = False - for line in lines: - # Check for section headers + for line in content.split("\n"): section_match = _SECTION_PATTERN.match(line) if section_match: current_section = section_match.group(1).strip() - past_first_section = True - in_optional = current_section.lower() == "optional" continue - if not past_first_section: + if not current_section: overview_lines.append(line) continue - if self.skip_optional and in_optional: + if self.skip_optional and current_section.lower() == "optional": continue - # Check for links link_match = _LINK_PATTERN.match(line.strip()) if link_match: title = link_match.group(1).strip() @@ -189,10 +181,7 @@ def fetch_url(self, url: str) -> Optional[str]: """Fetch content from a URL, returning text for text-like content or extracted text from HTML.""" try: log_debug(f"Fetching: {url}") - if self.proxy: - response = httpx.get(url, timeout=self.timeout, proxy=self.proxy, follow_redirects=True) - else: - response = httpx.get(url, timeout=self.timeout, follow_redirects=True) + response = httpx.get(url, timeout=self.timeout, proxy=self.proxy, follow_redirects=True) response.raise_for_status() return self._process_response(response.headers.get("content-type", ""), response.text) except httpx.HTTPStatusError as e: @@ -284,23 +273,18 @@ def read(self, url: str, 
name: Optional[str] = None) -> List[Document]: A list of documents from the llms.txt and all linked pages. """ log_debug(f"Reading llms.txt: {url}") - - # Fetch the llms.txt file llms_txt_content = self.fetch_url(url) if not llms_txt_content: log_error(f"Failed to fetch llms.txt from {url}") return [] - # Parse the llms.txt content overview, entries = self.parse_llms_txt(llms_txt_content, url) log_debug(f"Found {len(entries)} linked URLs in llms.txt") - # Limit the number of URLs to fetch entries_to_fetch = entries[: self.max_urls] if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - # Fetch all linked pages fetched: Dict[str, str] = {} for entry in entries_to_fetch: content = self.fetch_url(entry.url) @@ -321,25 +305,18 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen A list of documents from the llms.txt and all linked pages. """ log_debug(f"Reading llms.txt asynchronously: {url}") - - client_args = {"proxy": self.proxy} if self.proxy else {} - async with httpx.AsyncClient(**client_args) as client: # type: ignore - # Fetch the llms.txt file + async with httpx.AsyncClient(proxy=self.proxy) as client: llms_txt_content = await self.async_fetch_url(client, url) if not llms_txt_content: log_error(f"Failed to fetch llms.txt from {url}") return [] - # Parse the llms.txt content overview, entries = self.parse_llms_txt(llms_txt_content, url) log_debug(f"Found {len(entries)} linked URLs in llms.txt") - # Limit the number of URLs to fetch entries_to_fetch = entries[: self.max_urls] if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - - # Fetch all linked pages concurrently with a semaphore to limit parallelism semaphore = asyncio.Semaphore(_MAX_CONCURRENT_FETCHES) async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 
8fa8375d2c..1294198355 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -82,6 +82,24 @@ def _async_client_kwargs(self) -> Dict[str, Any]: kwargs["proxy"] = self.reader.proxy return kwargs + def _format_index(self, overview: str, entries: list) -> str: + """Build JSON index response from parsed llms.txt data.""" + return json.dumps( + { + "overview": overview, + "pages": [ + { + "title": e.title, + "url": e.url, + "description": e.description, + "section": e.section, + } + for e in entries + ], + "total_pages": len(entries), + } + ) + def get_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. @@ -99,21 +117,7 @@ def get_llms_txt_index(self, url: str) -> str: return f"Failed to fetch llms.txt from {url}" overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) - - index = { - "overview": overview, - "pages": [ - { - "title": entry.title, - "url": entry.url, - "description": entry.description, - "section": entry.section, - } - for entry in entries - ], - "total_pages": len(entries), - } - return json.dumps(index) + return self._format_index(overview, entries) async def aget_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. @@ -134,21 +138,7 @@ async def aget_llms_txt_index(self, url: str) -> str: return f"Failed to fetch llms.txt from {url}" overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) - - index = { - "overview": overview, - "pages": [ - { - "title": entry.title, - "url": entry.url, - "description": entry.description, - "section": entry.section, - } - for entry in entries - ], - "total_pages": len(entries), - } - return json.dumps(index) + return self._format_index(overview, entries) def read_llms_txt_url(self, url: str) -> str: """Fetches and returns the content of a specific documentation page URL. 
diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index 1740cd5488..500e0e3fcc 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -381,19 +381,12 @@ def test_delegates_to_knowledge_insert(self): mock_knowledge = MagicMock() tools = LLMsTxtTools(knowledge=mock_knowledge) - tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") mock_knowledge.insert.assert_called_once_with(url="https://example.com/llms.txt", reader=tools.reader) + assert "Successfully loaded" in result def test_returns_message_when_no_knowledge(self): tools = LLMsTxtTools() result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") assert result == "Knowledge base not provided" - - def test_returns_success_message(self): - mock_knowledge = MagicMock() - tools = LLMsTxtTools(knowledge=mock_knowledge) - - result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - - assert "Successfully loaded" in result From 7d88c4457bb9ccc8259cdc8aa11471c5722b6bdf Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:35:53 -0400 Subject: [PATCH 06/23] =?UTF-8?q?fix:=20remove=20include=5Fllms=5Ftxt=5Fco?= =?UTF-8?q?ntent=20parameter=20=E2=80=94=20always=20include=20overview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overview document (title + summary from the llms.txt) provides essential context about the project. No caller ever set this to False. Removing the parameter and its branch simplifies the reader. 
--- libs/agno/agno/knowledge/reader/llms_txt_reader.py | 6 +----- libs/agno/tests/unit/tools/test_llms_txt.py | 14 -------------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 839441b8e1..2eaac0896f 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -56,7 +56,6 @@ def __init__( max_urls: int = 100, timeout: int = 30, proxy: Optional[str] = None, - include_llms_txt_content: bool = True, skip_optional: bool = False, **kwargs, ): @@ -67,7 +66,6 @@ def __init__( max_urls: Maximum number of linked URLs to fetch. Defaults to 100. timeout: HTTP request timeout in seconds. Defaults to 30. proxy: Optional HTTP proxy URL. - include_llms_txt_content: Whether to include the llms.txt file itself as a document. skip_optional: Whether to skip URLs in the "Optional" section. """ if chunking_strategy is None: @@ -77,7 +75,6 @@ def __init__( self.max_urls = max_urls self.timeout = timeout self.proxy = proxy - self.include_llms_txt_content = include_llms_txt_content self.skip_optional = skip_optional @classmethod @@ -222,8 +219,7 @@ def _build_documents( """Build Document list from fetched content.""" documents: List[Document] = [] - # Optionally include the llms.txt overview as a document - if self.include_llms_txt_content and overview: + if overview: doc = Document( name=name or llms_txt_url, id=str(uuid.uuid4()), diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index 500e0e3fcc..7857341a27 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -60,7 +60,6 @@ def test_defaults(self): assert reader.max_urls == 100 assert reader.timeout == 30 assert reader.proxy is None - assert reader.include_llms_txt_content is True assert reader.skip_optional is False def 
test_custom_params(self): @@ -229,19 +228,6 @@ def test_skips_unfetched_entries(self): # Only the overview doc assert len(docs) == 1 - def test_excludes_overview_when_disabled(self): - reader = LLMsTxtReader(chunk=False, include_llms_txt_content=False) - entries = [ - LLMsTxtEntry(title="Page", url="https://example.com/page", description="", section="Docs"), - ] - fetched = {"https://example.com/page": "Page content"} - - docs = reader._build_documents("Overview", entries, fetched, "https://example.com/llms.txt", None) - - assert len(docs) == 1 - assert docs[0].meta_data["type"] == "llms_txt_linked_doc" - - class TestRead: def test_read_fetches_and_builds_docs(self): reader = LLMsTxtReader(max_urls=5, chunk=False) From 4474d950a474542610b76ba09325f08f2d827190 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:44:39 -0400 Subject: [PATCH 07/23] =?UTF-8?q?fix:=20clean=20up=20reader=20=E2=80=94=20?= =?UTF-8?q?remove=20init=20docstring,=20simplify=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove __init__ docstring (no other reader has one) - Rewrite parse_llms_txt: replace 3 continue statements with clean if/elif/else chain — each line falls into one bucket - Remove include_llms_txt_content param (always True, never exposed) --- .../agno/knowledge/reader/llms_txt_reader.py | 50 ++++++------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 2eaac0896f..ce4c95254e 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -59,15 +59,6 @@ def __init__( skip_optional: bool = False, **kwargs, ): - """Initialize the LLMsTxtReader. - - Args: - chunking_strategy: Strategy for chunking documents. - max_urls: Maximum number of linked URLs to fetch. Defaults to 100. 
- timeout: HTTP request timeout in seconds. Defaults to 30. - proxy: Optional HTTP proxy URL. - skip_optional: Whether to skip URLs in the "Optional" section. - """ if chunking_strategy is None: chunk_size = kwargs.get("chunk_size", 5000) chunking_strategy = FixedSizeChunking(chunk_size=chunk_size) @@ -109,33 +100,24 @@ def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxt section_match = _SECTION_PATTERN.match(line) if section_match: current_section = section_match.group(1).strip() - continue - - if not current_section: + elif not current_section: overview_lines.append(line) - continue - - if self.skip_optional and current_section.lower() == "optional": - continue - - link_match = _LINK_PATTERN.match(line.strip()) - if link_match: - title = link_match.group(1).strip() - url = link_match.group(2).strip() - description = (link_match.group(3) or "").strip() - - # Resolve relative URLs - if not url.startswith(("http://", "https://")): - url = urljoin(base_url, url) - - entries.append( - LLMsTxtEntry( - title=title, - url=url, - description=description, - section=current_section, + elif self.skip_optional and current_section.lower() == "optional": + pass + else: + link_match = _LINK_PATTERN.match(line.strip()) + if link_match: + url = link_match.group(2).strip() + if not url.startswith(("http://", "https://")): + url = urljoin(base_url, url) + entries.append( + LLMsTxtEntry( + title=link_match.group(1).strip(), + url=url, + description=(link_match.group(3) or "").strip(), + section=current_section, + ) ) - ) overview = "\n".join(overview_lines).strip() return overview, entries From 9039720decc5d04ea9d0b11ef228c51093f32c33 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:46:12 -0400 Subject: [PATCH 08/23] fix: inline _extract_content into _process_response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _extract_content was called exactly once. 
Inlining removes one indirection layer — the reader now has only the helpers that are actually shared between read() and async_read(). --- .../agno/knowledge/reader/llms_txt_reader.py | 45 ++++++++----------- libs/agno/tests/unit/tools/test_llms_txt.py | 10 ++--- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index ce4c95254e..70866196e2 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -122,37 +122,30 @@ def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxt overview = "\n".join(overview_lines).strip() return overview, entries - def _extract_content(self, html: str) -> str: - """Extract readable text content from HTML.""" - try: - from bs4 import BeautifulSoup - except ImportError: - raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") - - soup = BeautifulSoup(html, "html.parser") - - # Remove unwanted elements - for tag in soup.find_all(["script", "style", "nav", "header", "footer", "aside"]): - tag.decompose() - - # Try to find main content - main = soup.find("main") or soup.find("article") or soup.find(attrs={"role": "main"}) - if main: - return main.get_text(separator="\n", strip=True) - - body = soup.find("body") - if body: - return body.get_text(separator="\n", strip=True) - - return soup.get_text(separator="\n", strip=True) - def _process_response(self, content_type: str, text: str) -> str: - """Classify an HTTP response by content-type and return processed text.""" + """Classify an HTTP response by content-type and extract text.""" if any(t in content_type for t in ["text/plain", "text/markdown"]): return text if "text/html" in content_type or text.strip().startswith(("
Main content here
Foot
" - result = reader._extract_content(html) + result = reader._process_response("text/html",html) assert "Main content here" in result assert "Nav" not in result def test_extracts_from_body_fallback(self): reader = LLMsTxtReader() html = "
Body content
" - result = reader._extract_content(html) + result = reader._process_response("text/html",html) assert "Body content" in result def test_strips_script_and_style(self): reader = LLMsTxtReader() html = "

Text

" - result = reader._extract_content(html) + result = reader._process_response("text/html",html) assert "var x" not in result assert "Text" in result def test_preserves_structure_with_newlines(self): reader = LLMsTxtReader() html = "

First paragraph

Second paragraph

" - result = reader._extract_content(html) + result = reader._process_response("text/html",html) assert "First paragraph" in result assert "Second paragraph" in result assert "\n" in result From d6becc8f8df6e0eaa7fb2a3d710ab132c5923199 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:48:51 -0400 Subject: [PATCH 09/23] =?UTF-8?q?fix:=20simplify=20fetch=5Furl=20=E2=80=94?= =?UTF-8?q?=20collapse=203=20except=20blocks=20into=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 3-way exception split (HTTPStatusError, RequestError, Exception) was duplicated between sync and async. For a reader fetching doc pages, a single catch with a warning log is sufficient. Each method is now 4 lines instead of 12. --- .../agno/knowledge/reader/llms_txt_reader.py | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 70866196e2..07aedf1584 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -150,37 +150,23 @@ def _process_response(self, content_type: str, text: str) -> str: return text def fetch_url(self, url: str) -> Optional[str]: - """Fetch content from a URL, returning text for text-like content or extracted text from HTML.""" + """Fetch a URL and return its text content, or None on failure.""" try: - log_debug(f"Fetching: {url}") response = httpx.get(url, timeout=self.timeout, proxy=self.proxy, follow_redirects=True) response.raise_for_status() return self._process_response(response.headers.get("content-type", ""), response.text) - except httpx.HTTPStatusError as e: - log_warning(f"HTTP error fetching {url}: {e.response.status_code}") - return None - except httpx.RequestError as e: - log_warning(f"Request error fetching {url}: {str(e)}") - return None except Exception as e: - log_error(f"Failed to 
fetch {url}: {str(e)}") + log_warning(f"Failed to fetch {url}: {e}") return None async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: - """Asynchronously fetch content from a URL.""" + """Async variant of fetch_url using a shared client.""" try: - log_debug(f"Fetching asynchronously: {url}") response = await client.get(url, timeout=self.timeout, follow_redirects=True) response.raise_for_status() return self._process_response(response.headers.get("content-type", ""), response.text) - except httpx.HTTPStatusError as e: - log_warning(f"HTTP error fetching {url}: {e.response.status_code}") - return None - except httpx.RequestError as e: - log_warning(f"Request error fetching {url}: {str(e)}") - return None except Exception as e: - log_error(f"Failed to fetch {url}: {str(e)}") + log_warning(f"Failed to fetch {url}: {e}") return None def _build_documents( From 5ed981b403dd63409ad8f834bbcc61aa69e45243 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 11:57:23 -0400 Subject: [PATCH 10/23] fix: remove module-level constant, inline semaphore with WHY comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the semaphore (Codex confirms: this is external HTTP fan-out, not local processing — unbounded gather would burst 100 requests at once). Remove _MAX_CONCURRENT_FETCHES constant, inline the value with a comment explaining why it exists. 
--- libs/agno/agno/knowledge/reader/llms_txt_reader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 07aedf1584..3b713390fc 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -20,9 +20,6 @@ # Pattern to match H2 section headers _SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE) -# Maximum number of concurrent HTTP requests when fetching linked pages -_MAX_CONCURRENT_FETCHES = 10 - @dataclass class LLMsTxtEntry: @@ -274,7 +271,9 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen entries_to_fetch = entries[: self.max_urls] if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - semaphore = asyncio.Semaphore(_MAX_CONCURRENT_FETCHES) + # httpx pool limits handle per-host connections, but we also cap total + # in-flight fetches to avoid bursting 100 requests at third-party servers + semaphore = asyncio.Semaphore(10) async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: async with semaphore: From a5fe2a386770ce0bd8a5f8687449ed9249e38a7a Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:01:11 -0400 Subject: [PATCH 11/23] fix: reuse fetch_with_retry utils instead of raw httpx calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add timeout and follow_redirects params to existing fetch_with_retry and async_fetch_with_retry in utils/http.py. Reader now uses these shared utils instead of making raw httpx.get calls — retry logic, error handling, and connection management in one place. Removed semaphore — httpx AsyncClient already limits concurrent connections per host (default 20). 
--- .../agno/knowledge/reader/llms_txt_reader.py | 25 +++++++++++-------- libs/agno/agno/utils/http.py | 19 ++++++++------ libs/agno/tests/unit/tools/test_llms_txt.py | 9 ++++--- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 3b713390fc..5feb87acea 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -12,6 +12,7 @@ from agno.knowledge.document.base import Document from agno.knowledge.reader.base import Reader from agno.knowledge.types import ContentType +from agno.utils.http import async_fetch_with_retry, fetch_with_retry from agno.utils.log import log_debug, log_error, log_warning # Pattern to match markdown links: - [Title](url) or - [Title](url): description @@ -128,7 +129,9 @@ def _process_response(self, content_type: str, text: str) -> str: try: from bs4 import BeautifulSoup except ImportError: - raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") + raise ImportError( + "The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`." 
+ ) soup = BeautifulSoup(text, "html.parser") for tag in soup.find_all(["script", "style", "nav", "header", "footer", "aside"]): @@ -149,8 +152,9 @@ def _process_response(self, content_type: str, text: str) -> str: def fetch_url(self, url: str) -> Optional[str]: """Fetch a URL and return its text content, or None on failure.""" try: - response = httpx.get(url, timeout=self.timeout, proxy=self.proxy, follow_redirects=True) - response.raise_for_status() + response = fetch_with_retry( + url, max_retries=1, proxy=self.proxy, timeout=self.timeout, follow_redirects=True + ) return self._process_response(response.headers.get("content-type", ""), response.text) except Exception as e: log_warning(f"Failed to fetch {url}: {e}") @@ -159,8 +163,9 @@ def fetch_url(self, url: str) -> Optional[str]: async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: """Async variant of fetch_url using a shared client.""" try: - response = await client.get(url, timeout=self.timeout, follow_redirects=True) - response.raise_for_status() + response = await async_fetch_with_retry( + url, client=client, max_retries=1, timeout=self.timeout, follow_redirects=True + ) return self._process_response(response.headers.get("content-type", ""), response.text) except Exception as e: log_warning(f"Failed to fetch {url}: {e}") @@ -271,14 +276,12 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen entries_to_fetch = entries[: self.max_urls] if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - # httpx pool limits handle per-host connections, but we also cap total - # in-flight fetches to avoid bursting 100 requests at third-party servers - semaphore = asyncio.Semaphore(10) + # httpx AsyncClient limits concurrent connections per host (default 20), + # so we don't need application-level throttling async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: - async with semaphore: - 
content = await self.async_fetch_url(client, entry.url) - return entry.url, content + content = await self.async_fetch_url(client, entry.url) + return entry.url, content results = await asyncio.gather(*[_fetch_entry(e) for e in entries_to_fetch]) fetched: Dict[str, str] = {entry_url: content for entry_url, content in results if content} diff --git a/libs/agno/agno/utils/http.py b/libs/agno/agno/utils/http.py index ca887b3e83..053767033b 100644 --- a/libs/agno/agno/utils/http.py +++ b/libs/agno/agno/utils/http.py @@ -179,12 +179,16 @@ def fetch_with_retry( max_retries: int = DEFAULT_MAX_RETRIES, backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: Optional[str] = None, + timeout: Optional[int] = None, + follow_redirects: bool = False, ) -> httpx.Response: """Synchronous HTTP GET with retry logic.""" for attempt in range(max_retries): try: - response = httpx.get(url, proxy=proxy) if proxy else httpx.get(url) + response = httpx.get( + url, proxy=proxy, follow_redirects=follow_redirects, timeout=timeout # type: ignore[arg-type] + ) response.raise_for_status() return response except httpx.RequestError as e: @@ -198,7 +202,7 @@ def fetch_with_retry( logger.exception(f"HTTP error for {url}: {e.response.status_code} - {e.response.text}") raise - raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") + raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") # type: ignore[call-arg] async def async_fetch_with_retry( @@ -207,16 +211,17 @@ async def async_fetch_with_retry( max_retries: int = DEFAULT_MAX_RETRIES, backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: Optional[str] = None, + timeout: Optional[int] = None, + follow_redirects: bool = False, ) -> httpx.Response: """Asynchronous HTTP GET with retry logic.""" async def _fetch(): if client is None: - client_args = {"proxy": proxy} if proxy else {} - async with httpx.AsyncClient(**client_args) as local_client: # type: ignore - return await local_client.get(url) + async 
with httpx.AsyncClient(proxy=proxy) as local_client: + return await local_client.get(url, follow_redirects=follow_redirects, timeout=timeout) # type: ignore[arg-type] else: - return await client.get(url) + return await client.get(url, follow_redirects=follow_redirects, timeout=timeout) # type: ignore[arg-type] for attempt in range(max_retries): try: @@ -234,4 +239,4 @@ async def _fetch(): logger.exception(f"HTTP error for {url}: {e.response.status_code} - {e.response.text}") raise - raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") + raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") # type: ignore[call-arg] diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index f4939b7e3b..0346f06c5f 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -128,27 +128,27 @@ class TestProcessResponse: def test_extracts_from_main_tag(self): reader = LLMsTxtReader() html = "
Main content here
Foot
" - result = reader._process_response("text/html",html) + result = reader._process_response("text/html", html) assert "Main content here" in result assert "Nav" not in result def test_extracts_from_body_fallback(self): reader = LLMsTxtReader() html = "
Body content
" - result = reader._process_response("text/html",html) + result = reader._process_response("text/html", html) assert "Body content" in result def test_strips_script_and_style(self): reader = LLMsTxtReader() html = "

Text

" - result = reader._process_response("text/html",html) + result = reader._process_response("text/html", html) assert "var x" not in result assert "Text" in result def test_preserves_structure_with_newlines(self): reader = LLMsTxtReader() html = "

First paragraph

Second paragraph

" - result = reader._process_response("text/html",html) + result = reader._process_response("text/html", html) assert "First paragraph" in result assert "Second paragraph" in result assert "\n" in result @@ -228,6 +228,7 @@ def test_skips_unfetched_entries(self): # Only the overview doc assert len(docs) == 1 + class TestRead: def test_read_fetches_and_builds_docs(self): reader = LLMsTxtReader(max_urls=5, chunk=False) From 62e75c0cc9c9625b558a1979cfcd1ff42113a348 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:03:18 -0400 Subject: [PATCH 12/23] =?UTF-8?q?fix:=20change=20defaults=20=E2=80=94=20ma?= =?UTF-8?q?x=5Furls=3D20,=20timeout=3D60?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit max_urls=100 was too high — would overwhelm model context in agentic mode. 20 matches the knowledge cookbook and WebsiteReader's max_links=10 ballpark. timeout=60 matches the global httpx client default. --- libs/agno/agno/knowledge/reader/llms_txt_reader.py | 6 +++--- libs/agno/agno/tools/llms_txt.py | 8 ++++---- libs/agno/tests/unit/tools/test_llms_txt.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 5feb87acea..dd6e22e430 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -44,15 +44,15 @@ class LLMsTxtReader(Reader): - H2-delimited sections containing markdown links to documentation pages Example: - reader = LLMsTxtReader(max_urls=50) + reader = LLMsTxtReader(max_urls=20) documents = reader.read("https://docs.example.com/llms.txt") """ def __init__( self, chunking_strategy: Optional[ChunkingStrategy] = None, - max_urls: int = 100, - timeout: int = 30, + max_urls: int = 20, + timeout: int = 60, proxy: Optional[str] = None, skip_optional: bool = False, **kwargs, diff --git a/libs/agno/agno/tools/llms_txt.py 
b/libs/agno/agno/tools/llms_txt.py index 1294198355..ecbd05dbd2 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -27,8 +27,8 @@ class LLMsTxtTools(Toolkit): Args: knowledge: Optional Knowledge instance. When provided, enables knowledge loading mode. - max_urls: Maximum number of linked URLs to fetch when loading into knowledge. Defaults to 100. - timeout: HTTP request timeout in seconds. Defaults to 30. + max_urls: Maximum number of linked URLs to fetch when loading into knowledge. Defaults to 20. + timeout: HTTP request timeout in seconds. Defaults to 60. skip_optional: Whether to skip URLs listed in the "Optional" section. Defaults to False. Example: @@ -45,8 +45,8 @@ class LLMsTxtTools(Toolkit): def __init__( self, knowledge: Optional[Knowledge] = None, - max_urls: int = 100, - timeout: int = 30, + max_urls: int = 20, + timeout: int = 60, skip_optional: bool = False, **kwargs, ): diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index 0346f06c5f..edfb6023c9 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -57,8 +57,8 @@ class TestLLMsTxtReaderInit: def test_defaults(self): reader = LLMsTxtReader() - assert reader.max_urls == 100 - assert reader.timeout == 30 + assert reader.max_urls == 20 + assert reader.timeout == 60 assert reader.proxy is None assert reader.skip_optional is False From 1d8312ffb045f24fe23e7aca9ad53578cf352023 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:04:23 -0400 Subject: [PATCH 13/23] =?UTF-8?q?fix:=20move=20imports=20to=20module=20lev?= =?UTF-8?q?el=20=E2=80=94=20bs4=20and=20LLMsTxtReader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bs4 import now fails at import time (matching WebsiteReader and WebSearchReader pattern) instead of deep inside a fetch call. 
LLMsTxtReader import moved to top of toolkit — no reason to defer an internal agno module. --- libs/agno/agno/knowledge/reader/llms_txt_reader.py | 12 +++++------- libs/agno/agno/tools/llms_txt.py | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index dd6e22e430..8689ff7805 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -7,6 +7,11 @@ import httpx +try: + from bs4 import BeautifulSoup # noqa: F401 +except ImportError: + raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") + from agno.knowledge.chunking.fixed import FixedSizeChunking from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType from agno.knowledge.document.base import Document @@ -126,13 +131,6 @@ def _process_response(self, content_type: str, text: str) -> str: return text if "text/html" in content_type or text.strip().startswith((" Date: Fri, 10 Apr 2026 12:05:45 -0400 Subject: [PATCH 14/23] fix: remove class docstring and WHAT comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Class docstring was a 30-line essay — most toolkits have none. The code structure already shows the two modes (with/without knowledge). Removed remaining WHAT comment in _build_documents. 
--- .../agno/knowledge/reader/llms_txt_reader.py | 1 - libs/agno/agno/tools/llms_txt.py | 32 ------------------- 2 files changed, 33 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 8689ff7805..614c57e91c 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -195,7 +195,6 @@ def _build_documents( else: documents.append(doc) - # Add each fetched page as a document for entry in entries: content = fetched.get(entry.url) if not content: diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index b21ad5f45a..9c9441fffa 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -10,38 +10,6 @@ class LLMsTxtTools(Toolkit): - """Tools for reading llms.txt files and loading their linked documentation into a knowledge base. - - The llms.txt format (see https://llmstxt.org) is a standardized way for websites to provide - LLM-friendly documentation indexes. - - This toolkit provides two usage modes: - - **Agentic mode (without knowledge):** The agent gets two tools: - - `get_llms_txt_index` - reads the llms.txt and returns the index of available docs - - `read_llms_txt_url` - fetches a specific URL from the index - The agent reads the index, decides which pages are relevant, and fetches only those. - - **Knowledge mode (with knowledge):** The agent gets one tool: - - `read_llms_txt_and_load_knowledge` - reads the llms.txt, fetches all linked pages, - and loads them into the knowledge base. - - Args: - knowledge: Optional Knowledge instance. When provided, enables knowledge loading mode. - max_urls: Maximum number of linked URLs to fetch when loading into knowledge. Defaults to 20. - timeout: HTTP request timeout in seconds. Defaults to 60. - skip_optional: Whether to skip URLs listed in the "Optional" section. Defaults to False. 
- - Example: - # Agentic mode - agent reads index and picks which docs to fetch - tools = LLMsTxtTools() - agent = Agent(tools=[tools]) - - # Knowledge mode - bulk load all docs into KB - knowledge = Knowledge(vector_db=my_vector_db) - tools = LLMsTxtTools(knowledge=knowledge) - agent = Agent(tools=[tools], knowledge=knowledge) - """ def __init__( self, From 8bdd0610004be47800b4efd11f39e4712e848ff9 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:07:07 -0400 Subject: [PATCH 15/23] =?UTF-8?q?fix:=20clean=20up=20toolkit=20=E2=80=94?= =?UTF-8?q?=20trim=20docstrings,=20simplify=20helpers,=20add=20sections?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Trim tool docstrings: remove repeated llms.txt explanations, keep only what the LLM needs to decide when/how to call the tool - Replace _async_client_kwargs dict builder with _async_client() that returns the client directly - Add section comments to separate helpers / agentic tools / knowledge tools for scannable code - Remove unused Dict import --- libs/agno/agno/tools/llms_txt.py | 66 ++++++++++---------------------- 1 file changed, 20 insertions(+), 46 deletions(-) diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 9c9441fffa..454836e25b 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List, Optional +from typing import Any, List, Optional import httpx @@ -42,38 +42,28 @@ def __init__( super().__init__(name="llms_txt_tools", tools=tools, async_tools=async_tools_list, **kwargs) - def _async_client_kwargs(self) -> Dict[str, Any]: - """Build kwargs for httpx.AsyncClient matching the reader's config.""" - kwargs: Dict[str, Any] = {"timeout": httpx.Timeout(self.timeout)} - if self.reader.proxy: - kwargs["proxy"] = self.reader.proxy - return kwargs + # ---- Helpers (not exposed to the agent) ---- def _format_index(self, 
overview: str, entries: list) -> str: - """Build JSON index response from parsed llms.txt data.""" return json.dumps( { "overview": overview, "pages": [ - { - "title": e.title, - "url": e.url, - "description": e.description, - "section": e.section, - } + {"title": e.title, "url": e.url, "description": e.description, "section": e.section} for e in entries ], "total_pages": len(entries), } ) + def _async_client(self) -> httpx.AsyncClient: + return httpx.AsyncClient(timeout=self.timeout, proxy=self.reader.proxy) + + # ---- Tools: Agentic mode (without knowledge) ---- + def get_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. - - An llms.txt file is a standardized index of documentation for a project. - This function reads the index and returns all available pages with their titles, - URLs, descriptions, and sections. Use this to discover what documentation is - available, then use read_llms_txt_url to fetch specific pages. + Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: JSON with the overview and list of available documentation pages. @@ -88,17 +78,13 @@ def get_llms_txt_index(self, url: str) -> str: async def aget_llms_txt_index(self, url: str) -> str: """Reads an llms.txt file and returns the index of all available documentation pages. - - An llms.txt file is a standardized index of documentation for a project. - This function reads the index and returns all available pages with their titles, - URLs, descriptions, and sections. Use this to discover what documentation is - available, then use read_llms_txt_url to fetch specific pages. + Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). 
:return: JSON with the overview and list of available documentation pages. """ log_info(f"Reading llms.txt index from {url}") - async with httpx.AsyncClient(**self._async_client_kwargs()) as client: + async with self._async_client() as client: llms_txt_content = await self.reader.async_fetch_url(client, url) if not llms_txt_content: @@ -108,10 +94,8 @@ async def aget_llms_txt_index(self, url: str) -> str: return self._format_index(overview, entries) def read_llms_txt_url(self, url: str) -> str: - """Fetches and returns the content of a specific documentation page URL. - - Use this after calling get_llms_txt_index to fetch the content of specific pages - you want to read. You can call this multiple times for different URLs. + """Fetches and returns the content of a specific documentation page. + Use this after calling get_llms_txt_index to read pages relevant to the user's question. :param url: The URL of the documentation page to read. :return: The text content of the page. @@ -120,33 +104,27 @@ def read_llms_txt_url(self, url: str) -> str: content = self.reader.fetch_url(url) if not content: return f"Failed to fetch content from {url}" - return content async def aread_llms_txt_url(self, url: str) -> str: - """Fetches and returns the content of a specific documentation page URL. - - Use this after calling get_llms_txt_index to fetch the content of specific pages - you want to read. You can call this multiple times for different URLs. + """Fetches and returns the content of a specific documentation page. + Use this after calling get_llms_txt_index to read pages relevant to the user's question. :param url: The URL of the documentation page to read. :return: The text content of the page. 
""" log_debug(f"Fetching URL: {url}") - async with httpx.AsyncClient(**self._async_client_kwargs()) as client: + async with self._async_client() as client: content = await self.reader.async_fetch_url(client, url) if not content: return f"Failed to fetch content from {url}" - return content - def read_llms_txt_and_load_knowledge(self, url: str) -> str: - """Reads an llms.txt file, fetches all linked documentation pages, and loads them into the knowledge base. + # ---- Tools: Knowledge mode (with knowledge) ---- - An llms.txt file is a standardized index of documentation for a project. - This function reads the index, fetches every linked page, and stores the content - in the knowledge base for future retrieval. + def read_llms_txt_and_load_knowledge(self, url: str) -> str: + """Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: Summary of what was loaded into the knowledge base. @@ -159,11 +137,7 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: return f"Successfully loaded documentation from {url} into the knowledge base" async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: - """Reads an llms.txt file, fetches all linked documentation pages, and loads them into the knowledge base. - - An llms.txt file is a standardized index of documentation for a project. - This function reads the index, fetches every linked page, and stores the content - in the knowledge base for future retrieval. + """Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). :return: Summary of what was loaded into the knowledge base. 
From a252b2f45b6de6159a7b8d90abe38769519af551 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:09:22 -0400 Subject: [PATCH 16/23] =?UTF-8?q?fix:=20match=20Gmail=20toolkit=20docstrin?= =?UTF-8?q?g=20pattern=20=E2=80=94=20Args/Returns=20style?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings now use the same format as GmailTools and GoogleCalendarTools: triple-quote, Args (type): description, Returns: type: description. Replaced section dividers with inline comments matching Gmail pattern. Helpers have no docstrings (underscore prefix signals internal use). --- libs/agno/agno/tools/llms_txt.py | 83 ++++++++++++++++++++------------ libs/agno/agno/utils/http.py | 5 +- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 454836e25b..8fd00e0f24 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -10,14 +10,13 @@ class LLMsTxtTools(Toolkit): - def __init__( self, knowledge: Optional[Knowledge] = None, max_urls: int = 20, timeout: int = 60, skip_optional: bool = False, - **kwargs, + **kwargs: Any, ): self.knowledge: Optional[Knowledge] = knowledge self.max_urls = max_urls @@ -31,18 +30,21 @@ def __init__( tools: List[Any] = [] async_tools_list: List[tuple] = [] - if self.knowledge is not None: - tools.append(self.read_llms_txt_and_load_knowledge) - async_tools_list.append((self.aread_llms_txt_and_load_knowledge, "read_llms_txt_and_load_knowledge")) - else: + # Agentic mode — agent picks which pages to read + if self.knowledge is None: tools.append(self.get_llms_txt_index) tools.append(self.read_llms_txt_url) async_tools_list.append((self.aget_llms_txt_index, "get_llms_txt_index")) async_tools_list.append((self.aread_llms_txt_url, "read_llms_txt_url")) + # Knowledge mode — bulk load all pages into vector DB + else: + tools.append(self.read_llms_txt_and_load_knowledge) + 
async_tools_list.append((self.aread_llms_txt_and_load_knowledge, "read_llms_txt_and_load_knowledge")) super().__init__(name="llms_txt_tools", tools=tools, async_tools=async_tools_list, **kwargs) - # ---- Helpers (not exposed to the agent) ---- + def _async_client(self) -> httpx.AsyncClient: + return httpx.AsyncClient(timeout=self.timeout, proxy=self.reader.proxy) def _format_index(self, overview: str, entries: list) -> str: return json.dumps( @@ -56,17 +58,16 @@ def _format_index(self, overview: str, entries: list) -> str: } ) - def _async_client(self) -> httpx.AsyncClient: - return httpx.AsyncClient(timeout=self.timeout, proxy=self.reader.proxy) - - # ---- Tools: Agentic mode (without knowledge) ---- - def get_llms_txt_index(self, url: str) -> str: - """Reads an llms.txt file and returns the index of all available documentation pages. + """ + Reads an llms.txt file and returns the index of all available documentation pages. Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. - :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). - :return: JSON with the overview and list of available documentation pages. + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) + + Returns: + str: JSON with the overview and list of available documentation pages """ log_info(f"Reading llms.txt index from {url}") llms_txt_content = self.reader.fetch_url(url) @@ -77,11 +78,15 @@ def get_llms_txt_index(self, url: str) -> str: return self._format_index(overview, entries) async def aget_llms_txt_index(self, url: str) -> str: - """Reads an llms.txt file and returns the index of all available documentation pages. + """ + Reads an llms.txt file and returns the index of all available documentation pages. Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. - :param url: The URL of the llms.txt file (e.g. 
https://docs.example.com/llms.txt). - :return: JSON with the overview and list of available documentation pages. + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) + + Returns: + str: JSON with the overview and list of available documentation pages """ log_info(f"Reading llms.txt index from {url}") async with self._async_client() as client: @@ -94,11 +99,15 @@ async def aget_llms_txt_index(self, url: str) -> str: return self._format_index(overview, entries) def read_llms_txt_url(self, url: str) -> str: - """Fetches and returns the content of a specific documentation page. + """ + Fetches and returns the content of a specific documentation page. Use this after calling get_llms_txt_index to read pages relevant to the user's question. - :param url: The URL of the documentation page to read. - :return: The text content of the page. + Args: + url (str): The URL of the documentation page to read + + Returns: + str: The text content of the page """ log_debug(f"Fetching URL: {url}") content = self.reader.fetch_url(url) @@ -107,11 +116,15 @@ def read_llms_txt_url(self, url: str) -> str: return content async def aread_llms_txt_url(self, url: str) -> str: - """Fetches and returns the content of a specific documentation page. + """ + Fetches and returns the content of a specific documentation page. Use this after calling get_llms_txt_index to read pages relevant to the user's question. - :param url: The URL of the documentation page to read. - :return: The text content of the page. 
+ Args: + url (str): The URL of the documentation page to read + + Returns: + str: The text content of the page """ log_debug(f"Fetching URL: {url}") async with self._async_client() as client: @@ -121,13 +134,15 @@ async def aread_llms_txt_url(self, url: str) -> str: return f"Failed to fetch content from {url}" return content - # ---- Tools: Knowledge mode (with knowledge) ---- - def read_llms_txt_and_load_knowledge(self, url: str) -> str: - """Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. + """ + Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. - :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). - :return: Summary of what was loaded into the knowledge base. + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) + + Returns: + str: Summary of what was loaded into the knowledge base """ if self.knowledge is None: return "Knowledge base not provided" @@ -137,10 +152,14 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: return f"Successfully loaded documentation from {url} into the knowledge base" async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: - """Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. + """ + Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. + + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) - :param url: The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt). - :return: Summary of what was loaded into the knowledge base. 
+ Returns: + str: Summary of what was loaded into the knowledge base """ if self.knowledge is None: return "Knowledge base not provided" diff --git a/libs/agno/agno/utils/http.py b/libs/agno/agno/utils/http.py index 053767033b..833650dd4d 100644 --- a/libs/agno/agno/utils/http.py +++ b/libs/agno/agno/utils/http.py @@ -187,7 +187,10 @@ def fetch_with_retry( for attempt in range(max_retries): try: response = httpx.get( - url, proxy=proxy, follow_redirects=follow_redirects, timeout=timeout # type: ignore[arg-type] + url, + proxy=proxy, + follow_redirects=follow_redirects, + timeout=timeout, # type: ignore[arg-type] ) response.raise_for_status() return response From bc918b0ddda96e3ea499785f382509556fefa704 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:12:44 -0400 Subject: [PATCH 17/23] =?UTF-8?q?fix:=20add=20try/except=20to=20all=20tool?= =?UTF-8?q?s,=20reorder=20methods=20=E2=80=94=20helpers=20then=20public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toolkit: every tool method now wrapped in try/except returning error strings, matching Gmail/Calendar pattern. Helpers at top, tools below. Reader: reordered — __init__, classmethods, helpers (_process_response, _build_documents), then public methods (parse_llms_txt, fetch_url, read, async_read). Removed bloated docstrings on helpers. Trimmed class docstring to just the example. 
--- .../agno/knowledge/reader/llms_txt_reader.py | 160 +++++++----------- libs/agno/agno/tools/llms_txt.py | 82 +++++---- 2 files changed, 110 insertions(+), 132 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 614c57e91c..8f0a2a5e78 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -2,7 +2,7 @@ import re import uuid from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urljoin import httpx @@ -20,17 +20,12 @@ from agno.utils.http import async_fetch_with_retry, fetch_with_retry from agno.utils.log import log_debug, log_error, log_warning -# Pattern to match markdown links: - [Title](url) or - [Title](url): description -# Note: titles with nested brackets (e.g. [Agent [Beta]](url)) are not supported. _LINK_PATTERN = re.compile(r"-\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.+))?") -# Pattern to match H2 section headers _SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE) @dataclass class LLMsTxtEntry: - """A single entry parsed from an llms.txt file.""" - title: str url: str description: str @@ -38,15 +33,7 @@ class LLMsTxtEntry: class LLMsTxtReader(Reader): - """Reader for llms.txt files. - - Reads an llms.txt file (see https://llmstxt.org), parses all linked documentation URLs, - fetches the content of each linked page, and returns them as Documents. - - The llms.txt format is a standardized markdown file with: - - An H1 heading (project name) - - An optional blockquote summary - - H2-delimited sections containing markdown links to documentation pages + """Reader for llms.txt files (see https://llmstxt.org). 
Example: reader = LLMsTxtReader(max_urls=20) @@ -60,7 +47,7 @@ def __init__( timeout: int = 60, proxy: Optional[str] = None, skip_optional: bool = False, - **kwargs, + **kwargs: Any, ): if chunking_strategy is None: chunk_size = kwargs.get("chunk_size", 5000) @@ -85,48 +72,9 @@ def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]: def get_supported_content_types(cls) -> List[ContentType]: return [ContentType.URL] - def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]: - """Parse an llms.txt file and extract all linked URLs. - - Args: - content: The raw text content of the llms.txt file. - base_url: The base URL for resolving relative links. - - Returns: - A tuple of (overview text, list of LLMsTxtEntry). - """ - entries: List[LLMsTxtEntry] = [] - current_section = "" - overview_lines: List[str] = [] - - for line in content.split("\n"): - section_match = _SECTION_PATTERN.match(line) - if section_match: - current_section = section_match.group(1).strip() - elif not current_section: - overview_lines.append(line) - elif self.skip_optional and current_section.lower() == "optional": - pass - else: - link_match = _LINK_PATTERN.match(line.strip()) - if link_match: - url = link_match.group(2).strip() - if not url.startswith(("http://", "https://")): - url = urljoin(base_url, url) - entries.append( - LLMsTxtEntry( - title=link_match.group(1).strip(), - url=url, - description=(link_match.group(3) or "").strip(), - section=current_section, - ) - ) - - overview = "\n".join(overview_lines).strip() - return overview, entries + # Helpers def _process_response(self, content_type: str, text: str) -> str: - """Classify an HTTP response by content-type and extract text.""" if any(t in content_type for t in ["text/plain", "text/markdown"]): return text @@ -147,28 +95,6 @@ def _process_response(self, content_type: str, text: str) -> str: return text - def fetch_url(self, url: str) -> Optional[str]: - """Fetch a URL and return its 
text content, or None on failure.""" - try: - response = fetch_with_retry( - url, max_retries=1, proxy=self.proxy, timeout=self.timeout, follow_redirects=True - ) - return self._process_response(response.headers.get("content-type", ""), response.text) - except Exception as e: - log_warning(f"Failed to fetch {url}: {e}") - return None - - async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: - """Async variant of fetch_url using a shared client.""" - try: - response = await async_fetch_with_retry( - url, client=client, max_retries=1, timeout=self.timeout, follow_redirects=True - ) - return self._process_response(response.headers.get("content-type", ""), response.text) - except Exception as e: - log_warning(f"Failed to fetch {url}: {e}") - return None - def _build_documents( self, overview: str, @@ -177,17 +103,13 @@ def _build_documents( llms_txt_url: str, name: Optional[str], ) -> List[Document]: - """Build Document list from fetched content.""" documents: List[Document] = [] if overview: doc = Document( name=name or llms_txt_url, id=str(uuid.uuid4()), - meta_data={ - "url": llms_txt_url, - "type": "llms_txt_overview", - }, + meta_data={"url": llms_txt_url, "type": "llms_txt_overview"}, content=overview, ) if self.chunk: @@ -218,16 +140,60 @@ def _build_documents( return documents - def read(self, url: str, name: Optional[str] = None) -> List[Document]: - """Read an llms.txt file and all its linked documentation. + # Public methods - Args: - url: The URL of the llms.txt file. - name: Optional name for the documents. + def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]: + entries: List[LLMsTxtEntry] = [] + current_section = "" + overview_lines: List[str] = [] - Returns: - A list of documents from the llms.txt and all linked pages. 
- """ + for line in content.split("\n"): + section_match = _SECTION_PATTERN.match(line) + if section_match: + current_section = section_match.group(1).strip() + elif not current_section: + overview_lines.append(line) + elif self.skip_optional and current_section.lower() == "optional": + pass + else: + link_match = _LINK_PATTERN.match(line.strip()) + if link_match: + url = link_match.group(2).strip() + if not url.startswith(("http://", "https://")): + url = urljoin(base_url, url) + entries.append( + LLMsTxtEntry( + title=link_match.group(1).strip(), + url=url, + description=(link_match.group(3) or "").strip(), + section=current_section, + ) + ) + + overview = "\n".join(overview_lines).strip() + return overview, entries + + def fetch_url(self, url: str) -> Optional[str]: + try: + response = fetch_with_retry( + url, max_retries=1, proxy=self.proxy, timeout=self.timeout, follow_redirects=True + ) + return self._process_response(response.headers.get("content-type", ""), response.text) + except Exception as e: + log_warning(f"Failed to fetch {url}: {e}") + return None + + async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: + try: + response = await async_fetch_with_retry( + url, client=client, max_retries=1, timeout=self.timeout, follow_redirects=True + ) + return self._process_response(response.headers.get("content-type", ""), response.text) + except Exception as e: + log_warning(f"Failed to fetch {url}: {e}") + return None + + def read(self, url: str, name: Optional[str] = None) -> List[Document]: log_debug(f"Reading llms.txt: {url}") llms_txt_content = self.fetch_url(url) if not llms_txt_content: @@ -251,15 +217,6 @@ def read(self, url: str, name: Optional[str] = None) -> List[Document]: return self._build_documents(overview, entries_to_fetch, fetched, url, name) async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]: - """Asynchronously read an llms.txt file and all its linked documentation. 
- - Args: - url: The URL of the llms.txt file. - name: Optional name for the documents. - - Returns: - A list of documents from the llms.txt and all linked pages. - """ log_debug(f"Reading llms.txt asynchronously: {url}") async with httpx.AsyncClient(proxy=self.proxy) as client: llms_txt_content = await self.async_fetch_url(client, url) @@ -274,8 +231,7 @@ async def async_read(self, url: str, name: Optional[str] = None) -> List[Documen if len(entries) > self.max_urls: log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") - # httpx AsyncClient limits concurrent connections per host (default 20), - # so we don't need application-level throttling + # httpx AsyncClient limits concurrent connections per host (default 20) async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: content = await self.async_fetch_url(client, entry.url) return entry.url, content diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 8fd00e0f24..699d1c2c2d 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ b/libs/agno/agno/tools/llms_txt.py @@ -43,6 +43,8 @@ def __init__( super().__init__(name="llms_txt_tools", tools=tools, async_tools=async_tools_list, **kwargs) + # Helpers + def _async_client(self) -> httpx.AsyncClient: return httpx.AsyncClient(timeout=self.timeout, proxy=self.reader.proxy) @@ -58,6 +60,8 @@ def _format_index(self, overview: str, entries: list) -> str: } ) + # Tools + def get_llms_txt_index(self, url: str) -> str: """ Reads an llms.txt file and returns the index of all available documentation pages. 
@@ -69,13 +73,16 @@ def get_llms_txt_index(self, url: str) -> str: Returns: str: JSON with the overview and list of available documentation pages """ - log_info(f"Reading llms.txt index from {url}") - llms_txt_content = self.reader.fetch_url(url) - if not llms_txt_content: - return f"Failed to fetch llms.txt from {url}" + try: + log_info(f"Reading llms.txt index from {url}") + llms_txt_content = self.reader.fetch_url(url) + if not llms_txt_content: + return f"Failed to fetch llms.txt from {url}" - overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) - return self._format_index(overview, entries) + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) + return self._format_index(overview, entries) + except Exception as e: + return f"Error reading llms.txt index from {url}: {type(e).__name__}: {e}" async def aget_llms_txt_index(self, url: str) -> str: """ @@ -88,15 +95,18 @@ async def aget_llms_txt_index(self, url: str) -> str: Returns: str: JSON with the overview and list of available documentation pages """ - log_info(f"Reading llms.txt index from {url}") - async with self._async_client() as client: - llms_txt_content = await self.reader.async_fetch_url(client, url) + try: + log_info(f"Reading llms.txt index from {url}") + async with self._async_client() as client: + llms_txt_content = await self.reader.async_fetch_url(client, url) - if not llms_txt_content: - return f"Failed to fetch llms.txt from {url}" + if not llms_txt_content: + return f"Failed to fetch llms.txt from {url}" - overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) - return self._format_index(overview, entries) + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) + return self._format_index(overview, entries) + except Exception as e: + return f"Error reading llms.txt index from {url}: {type(e).__name__}: {e}" def read_llms_txt_url(self, url: str) -> str: """ @@ -109,11 +119,14 @@ def read_llms_txt_url(self, url: str) -> str: 
Returns: str: The text content of the page """ - log_debug(f"Fetching URL: {url}") - content = self.reader.fetch_url(url) - if not content: - return f"Failed to fetch content from {url}" - return content + try: + log_debug(f"Fetching URL: {url}") + content = self.reader.fetch_url(url) + if not content: + return f"Failed to fetch content from {url}" + return content + except Exception as e: + return f"Error fetching {url}: {type(e).__name__}: {e}" async def aread_llms_txt_url(self, url: str) -> str: """ @@ -126,13 +139,16 @@ async def aread_llms_txt_url(self, url: str) -> str: Returns: str: The text content of the page """ - log_debug(f"Fetching URL: {url}") - async with self._async_client() as client: - content = await self.reader.async_fetch_url(client, url) + try: + log_debug(f"Fetching URL: {url}") + async with self._async_client() as client: + content = await self.reader.async_fetch_url(client, url) - if not content: - return f"Failed to fetch content from {url}" - return content + if not content: + return f"Failed to fetch content from {url}" + return content + except Exception as e: + return f"Error fetching {url}: {type(e).__name__}: {e}" def read_llms_txt_and_load_knowledge(self, url: str) -> str: """ @@ -147,9 +163,12 @@ def read_llms_txt_and_load_knowledge(self, url: str) -> str: if self.knowledge is None: return "Knowledge base not provided" - log_info(f"Reading llms.txt from {url}") - self.knowledge.insert(url=url, reader=self.reader) - return f"Successfully loaded documentation from {url} into the knowledge base" + try: + log_info(f"Reading llms.txt from {url}") + self.knowledge.insert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" + except Exception as e: + return f"Error loading knowledge from {url}: {type(e).__name__}: {e}" async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: """ @@ -164,6 +183,9 @@ async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: if 
self.knowledge is None: return "Knowledge base not provided" - log_info(f"Reading llms.txt from {url}") - await self.knowledge.ainsert(url=url, reader=self.reader) - return f"Successfully loaded documentation from {url} into the knowledge base" + try: + log_info(f"Reading llms.txt from {url}") + await self.knowledge.ainsert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" + except Exception as e: + return f"Error loading knowledge from {url}: {type(e).__name__}: {e}" From beea0b0a4bfc571b396fa59c02fe8b5c17b4477e Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:14:02 -0400 Subject: [PATCH 18/23] fix: replace Any with proper types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tools list uses Callable instead of Any. Removed Any from kwargs (untyped kwargs is the codebase pattern — other toolkits don't type it). --- libs/agno/agno/knowledge/reader/llms_txt_reader.py | 4 ++-- libs/agno/agno/tools/llms_txt.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py index 8f0a2a5e78..4be8058b6e 100644 --- a/libs/agno/agno/knowledge/reader/llms_txt_reader.py +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -2,7 +2,7 @@ import re import uuid from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from urllib.parse import urljoin import httpx @@ -47,7 +47,7 @@ def __init__( timeout: int = 60, proxy: Optional[str] = None, skip_optional: bool = False, - **kwargs: Any, + **kwargs, ): if chunking_strategy is None: chunk_size = kwargs.get("chunk_size", 5000) diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py index 699d1c2c2d..cd33a0b0be 100644 --- a/libs/agno/agno/tools/llms_txt.py +++ 
b/libs/agno/agno/tools/llms_txt.py @@ -1,5 +1,5 @@ import json -from typing import Any, List, Optional +from typing import Callable, List, Optional import httpx @@ -16,7 +16,7 @@ def __init__( max_urls: int = 20, timeout: int = 60, skip_optional: bool = False, - **kwargs: Any, + **kwargs, ): self.knowledge: Optional[Knowledge] = knowledge self.max_urls = max_urls @@ -28,7 +28,7 @@ def __init__( skip_optional=skip_optional, ) - tools: List[Any] = [] + tools: List[Callable] = [] async_tools_list: List[tuple] = [] # Agentic mode — agent picks which pages to read if self.knowledge is None: From 7ebc5a2408e4212251fe3246aa1944341aae1dcd Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:19:33 -0400 Subject: [PATCH 19/23] =?UTF-8?q?test:=20rewrite=20tests=20following=20Per?= =?UTF-8?q?plexity/Gmail=20pattern=20=E2=80=94=2046=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructured from class-based to flat functions with @pytest.fixture, matching test_perplexity.py and test_gmail_tools.py patterns. 
New coverage: - Async reader: async_read happy path + failure - Async toolkit: aget_llms_txt_index, aread_llms_txt_url, aread_llms_txt_and_load_knowledge - Error handling: try/except returns error strings - Edge cases: empty overview, HTML sniffing, unknown content-type - Shared _mock_httpx_response helper for DRY mock setup 34 tests -> 46 tests --- libs/agno/tests/unit/tools/test_llms_txt.py | 671 ++++++++++++-------- 1 file changed, 417 insertions(+), 254 deletions(-) diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py index edfb6023c9..f4e3c47b0f 100644 --- a/libs/agno/tests/unit/tools/test_llms_txt.py +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -1,7 +1,7 @@ """Unit tests for LLMsTxtTools and LLMsTxtReader.""" import json -from unittest.mock import MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, Mock, patch import httpx import pytest @@ -12,7 +12,7 @@ from agno.tools.llms_txt import LLMsTxtTools # noqa: E402 # --------------------------------------------------------------------------- -# Sample llms.txt content for testing +# Fixtures # --------------------------------------------------------------------------- SAMPLE_LLMS_TXT = """# Acme Project @@ -49,331 +49,494 @@ """ -# --------------------------------------------------------------------------- -# LLMsTxtReader tests -# --------------------------------------------------------------------------- +@pytest.fixture +def reader(): + return LLMsTxtReader(chunk=False) -class TestLLMsTxtReaderInit: - def test_defaults(self): - reader = LLMsTxtReader() - assert reader.max_urls == 20 - assert reader.timeout == 60 - assert reader.proxy is None - assert reader.skip_optional is False - - def test_custom_params(self): - reader = LLMsTxtReader(max_urls=50, timeout=10, skip_optional=True) - assert reader.max_urls == 50 - assert reader.timeout == 10 - assert reader.skip_optional is True - - -class TestParseLLMsTxt: - def test_parses_entries(self): 
- reader = LLMsTxtReader() - overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - - assert len(entries) == 7 - assert entries[0].title == "Introduction" - assert entries[0].url == "https://docs.acme.com/introduction" - assert entries[0].description == "Overview of Acme" - assert entries[0].section == "Getting Started" - - def test_parses_overview(self): - reader = LLMsTxtReader() - overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - - assert "# Acme Project" in overview - assert "Acme makes it easy" in overview - - def test_sections_assigned(self): - reader = LLMsTxtReader() - _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") +@pytest.fixture +def tools(): + return LLMsTxtTools() - sections = {e.section for e in entries} - assert sections == {"Getting Started", "API Reference", "Optional"} - def test_skip_optional(self): - reader = LLMsTxtReader(skip_optional=True) - _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") +@pytest.fixture +def tools_with_knowledge(): + mock_knowledge = MagicMock() + return LLMsTxtTools(knowledge=mock_knowledge) - assert len(entries) == 5 - assert all(e.section != "Optional" for e in entries) - def test_relative_urls_resolved(self): - reader = LLMsTxtReader() - _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") +def _mock_httpx_response(text: str, content_type: str = "text/plain") -> Mock: + resp = Mock() + resp.headers = {"content-type": content_type} + resp.text = text + resp.raise_for_status = Mock() + return resp - assert entries[0].url == "https://example.com/docs/guide" - assert entries[1].url == "https://example.com/api/reference" - def test_empty_content(self): - reader = LLMsTxtReader() - overview, entries = reader.parse_llms_txt("", "https://example.com/llms.txt") +# 
============================================================================ +# READER: INIT +# ============================================================================ - assert overview == "" - assert entries == [] - def test_no_links(self): - content = "# Title\n\nSome overview text.\n\n## Section\n\nNo links here." - reader = LLMsTxtReader() - overview, entries = reader.parse_llms_txt(content, "https://example.com/llms.txt") - - assert "# Title" in overview - assert entries == [] +def test_reader_defaults(): + reader = LLMsTxtReader() + assert reader.max_urls == 20 + assert reader.timeout == 60 + assert reader.proxy is None + assert reader.skip_optional is False -class TestProcessResponse: - def test_extracts_from_main_tag(self): - reader = LLMsTxtReader() - html = "
Main content here
Foot
" - result = reader._process_response("text/html", html) - assert "Main content here" in result - assert "Nav" not in result - - def test_extracts_from_body_fallback(self): - reader = LLMsTxtReader() - html = "
Body content
" - result = reader._process_response("text/html", html) - assert "Body content" in result +def test_reader_custom_params(): + reader = LLMsTxtReader(max_urls=50, timeout=10, skip_optional=True) + assert reader.max_urls == 50 + assert reader.timeout == 10 + assert reader.skip_optional is True - def test_strips_script_and_style(self): - reader = LLMsTxtReader() - html = "

Text

" - result = reader._process_response("text/html", html) - assert "var x" not in result - assert "Text" in result - def test_preserves_structure_with_newlines(self): - reader = LLMsTxtReader() - html = "

First paragraph

Second paragraph

" - result = reader._process_response("text/html", html) - assert "First paragraph" in result - assert "Second paragraph" in result - assert "\n" in result +# ============================================================================ +# READER: PARSE +# ============================================================================ -class TestFetchUrl: - def test_returns_text_for_plain_content(self): - reader = LLMsTxtReader() - mock_response = MagicMock() - mock_response.headers = {"content-type": "text/plain"} - mock_response.text = "Plain text content" - mock_response.raise_for_status = MagicMock() +def test_parse_entries(reader): + overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - with patch("httpx.get", return_value=mock_response): - result = reader.fetch_url("https://example.com/file.txt") + assert len(entries) == 7 + assert entries[0].title == "Introduction" + assert entries[0].url == "https://docs.acme.com/introduction" + assert entries[0].description == "Overview of Acme" + assert entries[0].section == "Getting Started" - assert result == "Plain text content" - def test_extracts_html_content(self): - reader = LLMsTxtReader() - mock_response = MagicMock() - mock_response.headers = {"content-type": "text/html"} - mock_response.text = "
Extracted
" - mock_response.raise_for_status = MagicMock() +def test_parse_overview(reader): + overview, _ = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - with patch("httpx.get", return_value=mock_response): - result = reader.fetch_url("https://example.com/page") + assert "# Acme Project" in overview + assert "Acme makes it easy" in overview - assert "Extracted" in result - def test_returns_none_on_http_error(self): - reader = LLMsTxtReader() +def test_parse_sections(reader): + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - with patch( - "httpx.get", - side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock(status_code=404)), - ): - result = reader.fetch_url("https://example.com/missing") + sections = {e.section for e in entries} + assert sections == {"Getting Started", "API Reference", "Optional"} - assert result is None - def test_returns_none_on_request_error(self): - reader = LLMsTxtReader() +def test_parse_skip_optional(): + reader = LLMsTxtReader(skip_optional=True) + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") - with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): - result = reader.fetch_url("https://example.com/down") + assert len(entries) == 5 + assert all(e.section != "Optional" for e in entries) - assert result is None +def test_parse_relative_urls(reader): + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") -class TestBuildDocuments: - def test_builds_overview_and_linked_docs(self): - reader = LLMsTxtReader(chunk=False) - entries = [ - LLMsTxtEntry(title="Intro", url="https://example.com/intro", description="Intro page", section="Docs"), - ] - fetched = {"https://example.com/intro": "Introduction content here"} + assert entries[0].url == "https://example.com/docs/guide" + assert entries[1].url == "https://example.com/api/reference" - docs = 
reader._build_documents("Overview text", entries, fetched, "https://example.com/llms.txt", None) - assert len(docs) == 2 - assert docs[0].meta_data["type"] == "llms_txt_overview" - assert docs[0].content == "Overview text" - assert docs[1].meta_data["type"] == "llms_txt_linked_doc" - assert docs[1].name == "Intro" - assert docs[1].content == "Introduction content here" +def test_parse_empty_content(reader): + overview, entries = reader.parse_llms_txt("", "https://example.com/llms.txt") - def test_skips_unfetched_entries(self): - reader = LLMsTxtReader(chunk=False) - entries = [ - LLMsTxtEntry(title="Missing", url="https://example.com/missing", description="", section="Docs"), - ] - fetched = {} + assert overview == "" + assert entries == [] - docs = reader._build_documents("Overview", entries, fetched, "https://example.com/llms.txt", None) - # Only the overview doc - assert len(docs) == 1 +def test_parse_no_links(reader): + content = "# Title\n\nSome overview text.\n\n## Section\n\nNo links here." 
+ overview, entries = reader.parse_llms_txt(content, "https://example.com/llms.txt") + assert "# Title" in overview + assert entries == [] -class TestRead: - def test_read_fetches_and_builds_docs(self): - reader = LLMsTxtReader(max_urls=5, chunk=False) - def mock_fetch(url): - if url == "https://example.com/llms.txt": - return SAMPLE_LLMS_TXT - return f"Content of {url}" +# ============================================================================ +# READER: PROCESS RESPONSE +# ============================================================================ - with patch.object(reader, "fetch_url", side_effect=mock_fetch): - docs = reader.read("https://example.com/llms.txt") - # 1 overview + 5 linked docs (max_urls=5) - assert len(docs) == 6 - assert docs[0].meta_data["type"] == "llms_txt_overview" +def test_process_response_plain_text(reader): + result = reader._process_response("text/plain", "Plain text content") + assert result == "Plain text content" - def test_read_returns_empty_on_fetch_failure(self): - reader = LLMsTxtReader() - with patch.object(reader, "fetch_url", return_value=None): - docs = reader.read("https://example.com/llms.txt") +def test_process_response_markdown(reader): + result = reader._process_response("text/markdown", "# Heading\n\nBody") + assert result == "# Heading\n\nBody" - assert docs == [] - def test_max_urls_limits_fetched_pages(self): - reader = LLMsTxtReader(max_urls=2, chunk=False) +def test_process_response_html_extracts_main(reader): + html = "
Main content here
Foot
" + result = reader._process_response("text/html", html) + assert "Main content here" in result + assert "Nav" not in result - def mock_fetch(url): - if url == "https://example.com/llms.txt": - return SAMPLE_LLMS_TXT - return f"Content of {url}" - with patch.object(reader, "fetch_url", side_effect=mock_fetch): - docs = reader.read("https://example.com/llms.txt") +def test_process_response_html_body_fallback(reader): + html = "
Body content
" + result = reader._process_response("text/html", html) + assert "Body content" in result - # 1 overview + 2 linked docs (max_urls=2) - assert len(docs) == 3 +def test_process_response_strips_scripts(reader): + html = "

Text

" + result = reader._process_response("text/html", html) + assert "var x" not in result + assert "Text" in result + + +def test_process_response_newline_separator(reader): + html = "

First paragraph

Second paragraph

" + result = reader._process_response("text/html", html) + assert "First paragraph" in result + assert "Second paragraph" in result + assert "\n" in result + + +def test_process_response_html_sniffing(reader): + """HTML detected by content prefix when content-type header is missing.""" + result = reader._process_response("", "

Sniffed

") + assert "Sniffed" in result + + +def test_process_response_unknown_content_type(reader): + """Unknown content-type returns raw text.""" + result = reader._process_response("application/json", '{"key": "value"}') + assert result == '{"key": "value"}' + + +# ============================================================================ +# READER: FETCH +# ============================================================================ + + +def test_fetch_url_plain_content(reader): + mock_response = _mock_httpx_response("Plain text content", "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = reader.fetch_url("https://example.com/file.txt") + + assert result == "Plain text content" + + +def test_fetch_url_html_content(reader): + mock_response = _mock_httpx_response("
Extracted
", "text/html") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = reader.fetch_url("https://example.com/page") + + assert "Extracted" in result + + +def test_fetch_url_http_error(reader): + with patch( + "agno.utils.http.httpx.get", + side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock(status_code=404)), + ): + result = reader.fetch_url("https://example.com/missing") + + assert result is None + + +def test_fetch_url_request_error(reader): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = reader.fetch_url("https://example.com/down") + + assert result is None + + +# ============================================================================ +# READER: BUILD DOCUMENTS +# ============================================================================ + + +def test_build_documents_overview_and_linked(reader): + entries = [ + LLMsTxtEntry(title="Intro", url="https://example.com/intro", description="Intro page", section="Docs"), + ] + fetched = {"https://example.com/intro": "Introduction content here"} + + docs = reader._build_documents("Overview text", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 2 + assert docs[0].meta_data["type"] == "llms_txt_overview" + assert docs[0].content == "Overview text" + assert docs[1].meta_data["type"] == "llms_txt_linked_doc" + assert docs[1].name == "Intro" + assert docs[1].content == "Introduction content here" + + +def test_build_documents_skips_unfetched(reader): + entries = [ + LLMsTxtEntry(title="Missing", url="https://example.com/missing", description="", section="Docs"), + ] + docs = reader._build_documents("Overview", entries, {}, "https://example.com/llms.txt", None) + + assert len(docs) == 1 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +def test_build_documents_empty_overview(reader): + entries = [ + LLMsTxtEntry(title="Page", url="https://example.com/page", 
description="", section="Docs"), + ] + fetched = {"https://example.com/page": "Page content"} + + docs = reader._build_documents("", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 1 + assert docs[0].meta_data["type"] == "llms_txt_linked_doc" + + +# ============================================================================ +# READER: READ +# ============================================================================ + + +def test_read_fetches_and_builds(): + reader = LLMsTxtReader(max_urls=5, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + assert len(docs) == 6 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +def test_read_returns_empty_on_failure(): + reader = LLMsTxtReader() + + with patch.object(reader, "fetch_url", return_value=None): + docs = reader.read("https://example.com/llms.txt") + + assert docs == [] + + +def test_read_max_urls_limits(): + reader = LLMsTxtReader(max_urls=2, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + assert len(docs) == 3 + + +# ============================================================================ +# READER: ASYNC READ +# ============================================================================ + + +@pytest.mark.asyncio +async def test_async_read_fetches_concurrently(): + reader = LLMsTxtReader(max_urls=3, chunk=False) + + async def mock_async_fetch(client, url): + if "llms.txt" in url: + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "async_fetch_url", side_effect=mock_async_fetch): + docs = await 
reader.async_read("https://example.com/llms.txt") + + assert len(docs) == 4 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +@pytest.mark.asyncio +async def test_async_read_returns_empty_on_failure(): + reader = LLMsTxtReader() + + async def mock_async_fetch(client, url): + return None + + with patch.object(reader, "async_fetch_url", side_effect=mock_async_fetch): + docs = await reader.async_read("https://example.com/llms.txt") + + assert docs == [] + + +# ============================================================================ +# TOOLKIT: INIT +# ============================================================================ + + +def test_toolkit_agentic_tools(tools): + func_names = [func.name for func in tools.functions.values()] + assert "get_llms_txt_index" in func_names + assert "read_llms_txt_url" in func_names + assert "read_llms_txt_and_load_knowledge" not in func_names + + +def test_toolkit_async_tools(tools): + async_func_names = [func.name for func in tools.async_functions.values()] + assert "get_llms_txt_index" in async_func_names + assert "read_llms_txt_url" in async_func_names + + +def test_toolkit_knowledge_tools(tools_with_knowledge): + func_names = [func.name for func in tools_with_knowledge.functions.values()] + assert "read_llms_txt_and_load_knowledge" in func_names + assert "get_llms_txt_index" not in func_names + + +def test_toolkit_knowledge_async_tools(tools_with_knowledge): + async_func_names = [func.name for func in tools_with_knowledge.async_functions.values()] + assert "read_llms_txt_and_load_knowledge" in async_func_names + + +def test_toolkit_custom_params(): + t = LLMsTxtTools(max_urls=50, timeout=10, skip_optional=True) + assert t.max_urls == 50 + assert t.timeout == 10 + assert t.skip_optional is True + + +def test_toolkit_reader_reuse(tools): + assert tools.reader is not None + assert tools.reader.timeout == tools.timeout + assert tools.reader.max_urls == tools.max_urls + + +# 
============================================================================ +# TOOLKIT: GET INDEX +# ============================================================================ + + +def test_get_index_returns_json(tools): + mock_response = _mock_httpx_response(SAMPLE_LLMS_TXT, "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = tools.get_llms_txt_index("https://docs.acme.com/llms.txt") + + data = json.loads(result) + assert data["total_pages"] == 7 + assert data["pages"][0]["title"] == "Introduction" + assert data["pages"][0]["url"] == "https://docs.acme.com/introduction" + assert "overview" in data + + +def test_get_index_failure(tools): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.get_llms_txt_index("https://example.com/llms.txt") + + assert "Failed to fetch" in result + + +def test_get_index_error_handling(tools): + with patch.object(tools.reader, "fetch_url", side_effect=RuntimeError("unexpected")): + result = tools.get_llms_txt_index("https://example.com/llms.txt") + + assert "Error" in result + assert "RuntimeError" in result + + +# ============================================================================ +# TOOLKIT: READ URL +# ============================================================================ + + +def test_read_url_returns_content(tools): + mock_response = _mock_httpx_response("Page content here", "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = tools.read_llms_txt_url("https://docs.acme.com/introduction") + + assert result == "Page content here" + + +def test_read_url_failure(tools): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.read_llms_txt_url("https://example.com/missing") + + assert "Failed to fetch" in result -# --------------------------------------------------------------------------- -# LLMsTxtTools tests 
-# --------------------------------------------------------------------------- +# ============================================================================ +# TOOLKIT: ASYNC TOOLS +# ============================================================================ -class TestLLMsTxtToolsInit: - def test_without_knowledge_registers_agentic_tools(self): - tools = LLMsTxtTools() - func_names = [func.name for func in tools.functions.values()] - assert "get_llms_txt_index" in func_names - assert "read_llms_txt_url" in func_names - assert "read_llms_txt_and_load_knowledge" not in func_names - def test_without_knowledge_registers_async_tools(self): - tools = LLMsTxtTools() - async_func_names = [func.name for func in tools.async_functions.values()] - assert "get_llms_txt_index" in async_func_names - assert "read_llms_txt_url" in async_func_names +@pytest.mark.asyncio +async def test_aget_index_returns_json(tools): + mock_response = _mock_httpx_response(SAMPLE_LLMS_TXT, "text/plain") - def test_with_knowledge_registers_load(self): - mock_knowledge = MagicMock() - tools = LLMsTxtTools(knowledge=mock_knowledge) - func_names = [func.name for func in tools.functions.values()] - assert "read_llms_txt_and_load_knowledge" in func_names - assert "get_llms_txt_index" not in func_names + mock_client = AsyncMock() + mock_client.get.return_value = mock_response - def test_with_knowledge_registers_async_load(self): - mock_knowledge = MagicMock() - tools = LLMsTxtTools(knowledge=mock_knowledge) - async_func_names = [func.name for func in tools.async_functions.values()] - assert "read_llms_txt_and_load_knowledge" in async_func_names + with patch("agno.tools.llms_txt.httpx.AsyncClient") as mock_async_client: + mock_async_client.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_async_client.return_value.__aexit__ = AsyncMock(return_value=False) - def test_custom_params(self): - tools = LLMsTxtTools(max_urls=50, timeout=10, skip_optional=True) - assert tools.max_urls == 50 - 
assert tools.timeout == 10 - assert tools.skip_optional is True + result = await tools.aget_llms_txt_index("https://docs.acme.com/llms.txt") - def test_reader_is_reused(self): - tools = LLMsTxtTools() - assert tools.reader is not None - assert tools.reader.timeout == tools.timeout - assert tools.reader.max_urls == tools.max_urls + data = json.loads(result) + assert data["total_pages"] == 7 + assert data["pages"][0]["title"] == "Introduction" -class TestGetLLMsTxtIndex: - def test_returns_index_json(self): - tools = LLMsTxtTools() +@pytest.mark.asyncio +async def test_aread_url_returns_content(tools): + mock_response = _mock_httpx_response("Async page content", "text/plain") - mock_response = MagicMock() - mock_response.headers = {"content-type": "text/plain"} - mock_response.text = SAMPLE_LLMS_TXT - mock_response.raise_for_status = MagicMock() + mock_client = AsyncMock() + mock_client.get.return_value = mock_response - with patch("httpx.get", return_value=mock_response): - result = tools.get_llms_txt_index("https://docs.acme.com/llms.txt") + with patch("agno.tools.llms_txt.httpx.AsyncClient") as mock_async_client: + mock_async_client.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_async_client.return_value.__aexit__ = AsyncMock(return_value=False) - data = json.loads(result) - assert data["total_pages"] == 7 - assert data["pages"][0]["title"] == "Introduction" - assert data["pages"][0]["url"] == "https://docs.acme.com/introduction" - assert "overview" in data + result = await tools.aread_llms_txt_url("https://docs.acme.com/page") - def test_returns_error_on_fetch_failure(self): - tools = LLMsTxtTools() + assert result == "Async page content" - with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): - result = tools.get_llms_txt_index("https://example.com/llms.txt") - assert "Failed to fetch" in result +@pytest.mark.asyncio +async def test_aread_knowledge_delegates(tools_with_knowledge): + 
tools_with_knowledge.knowledge.ainsert = AsyncMock() + result = await tools_with_knowledge.aread_llms_txt_and_load_knowledge("https://example.com/llms.txt") -class TestReadLLMsTxtUrl: - def test_returns_page_content(self): - tools = LLMsTxtTools() + tools_with_knowledge.knowledge.ainsert.assert_called_once_with( + url="https://example.com/llms.txt", reader=tools_with_knowledge.reader + ) + assert "Successfully loaded" in result - mock_response = MagicMock() - mock_response.headers = {"content-type": "text/plain"} - mock_response.text = "Page content here" - mock_response.raise_for_status = MagicMock() - with patch("httpx.get", return_value=mock_response): - result = tools.read_llms_txt_url("https://docs.acme.com/introduction") +# ============================================================================ +# TOOLKIT: KNOWLEDGE +# ============================================================================ - assert result == "Page content here" - def test_returns_error_on_fetch_failure(self): - tools = LLMsTxtTools() +def test_knowledge_delegates_to_insert(tools_with_knowledge): + result = tools_with_knowledge.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - with patch("httpx.get", side_effect=httpx.RequestError("connection failed")): - result = tools.read_llms_txt_url("https://example.com/missing") + tools_with_knowledge.knowledge.insert.assert_called_once_with( + url="https://example.com/llms.txt", reader=tools_with_knowledge.reader + ) + assert "Successfully loaded" in result - assert "Failed to fetch" in result +def test_knowledge_no_knowledge(tools): + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + assert result == "Knowledge base not provided" -class TestLoadKnowledge: - def test_delegates_to_knowledge_insert(self): - mock_knowledge = MagicMock() - tools = LLMsTxtTools(knowledge=mock_knowledge) - result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") +def 
test_knowledge_error_handling(tools_with_knowledge): + tools_with_knowledge.knowledge.insert.side_effect = RuntimeError("db connection failed") - mock_knowledge.insert.assert_called_once_with(url="https://example.com/llms.txt", reader=tools.reader) - assert "Successfully loaded" in result + result = tools_with_knowledge.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - def test_returns_message_when_no_knowledge(self): - tools = LLMsTxtTools() - result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") - assert result == "Knowledge base not provided" + assert "Error" in result + assert "RuntimeError" in result From 4908f397fa1aab74bfddf5efc31f3e9fdcdd2c00 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:33:51 -0400 Subject: [PATCH 20/23] fix: use ContentType.URL to decide pre-download skip in Knowledge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (skip pre-download when any custom reader is provided) broke PDFReader and other file-based readers that need BytesIO. Now we check if the reader supports ContentType.URL — only URL-based readers like LLMsTxtReader and WebsiteReader skip the pre-download. File-based readers (PDFReader, CSVReader, etc.) still get pre-downloaded bytes. 
--- libs/agno/agno/knowledge/knowledge.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/libs/agno/agno/knowledge/knowledge.py b/libs/agno/agno/knowledge/knowledge.py index dd01927627..bcb01dcbe2 100644 --- a/libs/agno/agno/knowledge/knowledge.py +++ b/libs/agno/agno/knowledge/knowledge.py @@ -17,6 +17,7 @@ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData from agno.knowledge.document import Document from agno.knowledge.reader import Reader, ReaderFactory +from agno.knowledge.types import ContentType from agno.knowledge.remote_content.base import BaseStorageConfig from agno.knowledge.remote_content.remote_content import ( RemoteContent, @@ -1564,9 +1565,14 @@ async def _aload_from_url( file_extension = url_path.suffix.lower() bytes_content = None - # Skip pre-download when a custom reader is provided — it knows how to - # handle the URL directly (e.g. LLMsTxtReader fetches linked pages) - if file_extension and not content.reader: + # Skip pre-download when a custom URL-based reader is provided — + # it handles the URL directly (e.g. LLMsTxtReader fetches linked pages) + skip_download = ( + content.reader is not None + and hasattr(content.reader, "get_supported_content_types") + and ContentType.URL in content.reader.get_supported_content_types() + ) + if file_extension and not skip_download: async with AsyncClient() as client: response = await async_fetch_with_retry(content.url, client=client) bytes_content = BytesIO(response.content) @@ -1718,9 +1724,14 @@ def _load_from_url( file_extension = url_path.suffix.lower() bytes_content = None - # Skip pre-download when a custom reader is provided — it knows how to - # handle the URL directly (e.g. LLMsTxtReader fetches linked pages) - if file_extension and not content.reader: + # Skip pre-download when a custom URL-based reader is provided — + # it handles the URL directly (e.g. 
LLMsTxtReader fetches linked pages) + skip_download = ( + content.reader is not None + and hasattr(content.reader, "get_supported_content_types") + and ContentType.URL in content.reader.get_supported_content_types() + ) + if file_extension and not skip_download: response = fetch_with_retry(content.url) bytes_content = BytesIO(response.content) From b075bae7ce10151db1e046f4fd90a04900dac347 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:51:28 -0400 Subject: [PATCH 21/23] fix: preserve httpx defaults when timeout/follow_redirects not specified Only forward timeout and follow_redirects to httpx when explicitly passed by the caller. Previously, default values (timeout=None, follow_redirects=False) were always forwarded, which removed httpx's built-in 5s timeout and overrode client-level redirect settings. --- libs/agno/agno/utils/http.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/libs/agno/agno/utils/http.py b/libs/agno/agno/utils/http.py index 833650dd4d..5f2cbe59ba 100644 --- a/libs/agno/agno/utils/http.py +++ b/libs/agno/agno/utils/http.py @@ -180,18 +180,18 @@ def fetch_with_retry( backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: Optional[str] = None, timeout: Optional[int] = None, - follow_redirects: bool = False, + follow_redirects: Optional[bool] = None, ) -> httpx.Response: """Synchronous HTTP GET with retry logic.""" for attempt in range(max_retries): try: - response = httpx.get( - url, - proxy=proxy, - follow_redirects=follow_redirects, - timeout=timeout, # type: ignore[arg-type] - ) + kwargs: dict = {"proxy": proxy} + if timeout is not None: + kwargs["timeout"] = timeout + if follow_redirects is not None: + kwargs["follow_redirects"] = follow_redirects + response = httpx.get(url, **kwargs) response.raise_for_status() return response except httpx.RequestError as e: @@ -215,16 +215,22 @@ async def async_fetch_with_retry( backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: 
Optional[str] = None, timeout: Optional[int] = None, - follow_redirects: bool = False, + follow_redirects: Optional[bool] = None, ) -> httpx.Response: """Asynchronous HTTP GET with retry logic.""" async def _fetch(): + kwargs: dict = {} + if timeout is not None: + kwargs["timeout"] = timeout + if follow_redirects is not None: + kwargs["follow_redirects"] = follow_redirects + if client is None: async with httpx.AsyncClient(proxy=proxy) as local_client: - return await local_client.get(url, follow_redirects=follow_redirects, timeout=timeout) # type: ignore[arg-type] + return await local_client.get(url, **kwargs) else: - return await client.get(url, follow_redirects=follow_redirects, timeout=timeout) # type: ignore[arg-type] + return await client.get(url, **kwargs) for attempt in range(max_retries): try: From 4b6fb1d593b3025ef070f63d6a6963eeb94a9313 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Fri, 10 Apr 2026 12:51:47 -0400 Subject: [PATCH 22/23] fix: use Optional for new http util params, fix import order follow_redirects and timeout use an Optional type with a None default so existing callers see zero behavior change. Build kwargs dict conditionally instead of type-ignore comments. Import order fixed by format.sh.
--- libs/agno/agno/knowledge/knowledge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/agno/agno/knowledge/knowledge.py b/libs/agno/agno/knowledge/knowledge.py index bcb01dcbe2..89780edde8 100644 --- a/libs/agno/agno/knowledge/knowledge.py +++ b/libs/agno/agno/knowledge/knowledge.py @@ -17,12 +17,12 @@ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData from agno.knowledge.document import Document from agno.knowledge.reader import Reader, ReaderFactory -from agno.knowledge.types import ContentType from agno.knowledge.remote_content.base import BaseStorageConfig from agno.knowledge.remote_content.remote_content import ( RemoteContent, ) from agno.knowledge.remote_knowledge import RemoteKnowledge +from agno.knowledge.types import ContentType from agno.knowledge.utils import merge_user_metadata, set_agno_metadata, strip_agno_metadata from agno.utils.http import async_fetch_with_retry from agno.utils.log import log_debug, log_error, log_info, log_warning From 32b0aacbabdbe6a42f86fc43965acf78fd781e64 Mon Sep 17 00:00:00 2001 From: Mustafa Esoofally Date: Wed, 15 Apr 2026 15:36:49 -0400 Subject: [PATCH 23/23] wip: checkpoint LLMs.txt local review-round history + stray test file --- libs/agno/tests/unit/os/routers/test_sort_order_default.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/agno/tests/unit/os/routers/test_sort_order_default.py b/libs/agno/tests/unit/os/routers/test_sort_order_default.py index 856d625ad2..1843e6f148 100644 --- a/libs/agno/tests/unit/os/routers/test_sort_order_default.py +++ b/libs/agno/tests/unit/os/routers/test_sort_order_default.py @@ -15,7 +15,6 @@ from agno.os.schema import SortOrder - # --------------------------------------------------------------------------- # Helpers – create mock DB / Knowledge with only the methods each router needs # ---------------------------------------------------------------------------