diff --git a/cookbook/91_tools/llms_txt_tools.py b/cookbook/91_tools/llms_txt_tools.py new file mode 100644 index 0000000000..cb2379c2ec --- /dev/null +++ b/cookbook/91_tools/llms_txt_tools.py @@ -0,0 +1,42 @@ +""" +LLMs.txt Tools - Agentic Documentation Discovery +============================= + +Demonstrates how to use LLMsTxtTools in agentic mode where the agent: +1. Reads the llms.txt index to discover available documentation pages +2. Decides which pages are relevant to the user's question +3. Fetches only the specific pages it needs + +The llms.txt format (https://llmstxt.org) is a standardized way for websites +to provide LLM-friendly documentation indexes. +""" + +from agno.agent import Agent +from agno.models.openai import OpenAIResponses +from agno.tools.llms_txt import LLMsTxtTools + +# --------------------------------------------------------------------------- +# Create Agent +# --------------------------------------------------------------------------- + +agent = Agent( + model=OpenAIResponses(id="gpt-5.4"), + tools=[LLMsTxtTools()], + instructions=[ + "You can read llms.txt files to discover documentation for any project.", + "First use get_llms_txt_index to see what pages are available.", + "Then use read_llms_txt_url to fetch only the pages relevant to the user's question.", + ], + markdown=True, +) + +# --------------------------------------------------------------------------- +# Run Agent +# --------------------------------------------------------------------------- +if __name__ == "__main__": + agent.print_response( + "Using the llms.txt at https://docs.agno.com/llms.txt, " + "find and read the documentation about how to create an agent with tools", + markdown=True, + stream=True, + ) diff --git a/cookbook/91_tools/llms_txt_tools_knowledge.py b/cookbook/91_tools/llms_txt_tools_knowledge.py new file mode 100644 index 0000000000..ce1c131f99 --- /dev/null +++ b/cookbook/91_tools/llms_txt_tools_knowledge.py @@ -0,0 +1,56 @@ +""" +LLMs.txt Tools with 
Knowledge Base +============================= + +Demonstrates loading documentation from an llms.txt file into a knowledge base +for retrieval-augmented generation (RAG). + +The agent reads the llms.txt index, fetches the linked documentation pages (capped by max_urls), +and stores them in a PgVector knowledge base for semantic search. +""" + +from agno.agent import Agent +from agno.knowledge.knowledge import Knowledge +from agno.models.openai import OpenAIResponses +from agno.tools.llms_txt import LLMsTxtTools +from agno.vectordb.pgvector import PgVector + +# --------------------------------------------------------------------------- +# Setup Knowledge Base +# --------------------------------------------------------------------------- + +db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai" + +knowledge = Knowledge( + vector_db=PgVector( + table_name="llms_txt_docs", + db_url=db_url, + ), +) + +# --------------------------------------------------------------------------- +# Create Agent +# --------------------------------------------------------------------------- + +agent = Agent( + model=OpenAIResponses(id="gpt-5.4"), + knowledge=knowledge, + search_knowledge=True, + tools=[LLMsTxtTools(knowledge=knowledge, max_urls=20)], + instructions=[ + "You can load documentation from llms.txt files into your knowledge base.", + "When asked about a project, first load its llms.txt into the knowledge base, then answer questions.", + ], + markdown=True, +) + +# --------------------------------------------------------------------------- +# Run Agent +# --------------------------------------------------------------------------- +if __name__ == "__main__": + agent.print_response( + "Load the documentation from https://docs.agno.com/llms.txt into the knowledge base, " + "then tell me how to create an agent with Agno", + markdown=True, + stream=True, + ) diff --git a/libs/agno/agno/knowledge/knowledge.py b/libs/agno/agno/knowledge/knowledge.py index 200018d1b4..89780edde8 100644 --- 
a/libs/agno/agno/knowledge/knowledge.py +++ b/libs/agno/agno/knowledge/knowledge.py @@ -22,6 +22,7 @@ RemoteContent, ) from agno.knowledge.remote_knowledge import RemoteKnowledge +from agno.knowledge.types import ContentType from agno.knowledge.utils import merge_user_metadata, set_agno_metadata, strip_agno_metadata from agno.utils.http import async_fetch_with_retry from agno.utils.log import log_debug, log_error, log_info, log_warning @@ -1564,7 +1565,14 @@ async def _aload_from_url( file_extension = url_path.suffix.lower() bytes_content = None - if file_extension: + # Skip pre-download when a custom URL-based reader is provided — + # it handles the URL directly (e.g. LLMsTxtReader fetches linked pages) + skip_download = ( + content.reader is not None + and hasattr(content.reader, "get_supported_content_types") + and ContentType.URL in content.reader.get_supported_content_types() + ) + if file_extension and not skip_download: async with AsyncClient() as client: response = await async_fetch_with_retry(content.url, client=client) bytes_content = BytesIO(response.content) @@ -1716,7 +1724,14 @@ def _load_from_url( file_extension = url_path.suffix.lower() bytes_content = None - if file_extension: + # Skip pre-download when a custom URL-based reader is provided — + # it handles the URL directly (e.g. 
LLMsTxtReader fetches linked pages) + skip_download = ( + content.reader is not None + and hasattr(content.reader, "get_supported_content_types") + and ContentType.URL in content.reader.get_supported_content_types() + ) + if file_extension and not skip_download: response = fetch_with_retry(content.url) bytes_content = BytesIO(response.content) diff --git a/libs/agno/agno/knowledge/reader/llms_txt_reader.py b/libs/agno/agno/knowledge/reader/llms_txt_reader.py new file mode 100644 index 0000000000..4be8058b6e --- /dev/null +++ b/libs/agno/agno/knowledge/reader/llms_txt_reader.py @@ -0,0 +1,243 @@ +import asyncio +import re +import uuid +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from urllib.parse import urljoin + +import httpx + +try: + from bs4 import BeautifulSoup # noqa: F401 +except ImportError: + raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.") + +from agno.knowledge.chunking.fixed import FixedSizeChunking +from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType +from agno.knowledge.document.base import Document +from agno.knowledge.reader.base import Reader +from agno.knowledge.types import ContentType +from agno.utils.http import async_fetch_with_retry, fetch_with_retry +from agno.utils.log import log_debug, log_error, log_warning + +_LINK_PATTERN = re.compile(r"-\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.+))?") +_SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE) + + +@dataclass +class LLMsTxtEntry: + title: str + url: str + description: str + section: str + + +class LLMsTxtReader(Reader): + """Reader for llms.txt files (see https://llmstxt.org). 
+ + Example: + reader = LLMsTxtReader(max_urls=20) + documents = reader.read("https://docs.example.com/llms.txt") + """ + + def __init__( + self, + chunking_strategy: Optional[ChunkingStrategy] = None, + max_urls: int = 20, + timeout: int = 60, + proxy: Optional[str] = None, + skip_optional: bool = False, + **kwargs, + ): + if chunking_strategy is None: + chunk_size = kwargs.get("chunk_size", 5000) + chunking_strategy = FixedSizeChunking(chunk_size=chunk_size) + super().__init__(chunking_strategy=chunking_strategy, **kwargs) + self.max_urls = max_urls + self.timeout = timeout + self.proxy = proxy + self.skip_optional = skip_optional + + @classmethod + def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]: + return [ + ChunkingStrategyType.FIXED_SIZE_CHUNKER, + ChunkingStrategyType.AGENTIC_CHUNKER, + ChunkingStrategyType.DOCUMENT_CHUNKER, + ChunkingStrategyType.RECURSIVE_CHUNKER, + ChunkingStrategyType.SEMANTIC_CHUNKER, + ] + + @classmethod + def get_supported_content_types(cls) -> List[ContentType]: + return [ContentType.URL] + + # Helpers + + def _process_response(self, content_type: str, text: str) -> str: + if any(t in content_type for t in ["text/plain", "text/markdown"]): + return text + + if "text/html" in content_type or text.strip().startswith((" List[Document]: + documents: List[Document] = [] + + if overview: + doc = Document( + name=name or llms_txt_url, + id=str(uuid.uuid4()), + meta_data={"url": llms_txt_url, "type": "llms_txt_overview"}, + content=overview, + ) + if self.chunk: + documents.extend(self.chunk_document(doc)) + else: + documents.append(doc) + + for entry in entries: + content = fetched.get(entry.url) + if not content: + continue + + doc = Document( + name=entry.title, + id=str(uuid.uuid4()), + meta_data={ + "url": entry.url, + "section": entry.section, + "description": entry.description, + "type": "llms_txt_linked_doc", + }, + content=content, + ) + if self.chunk: + documents.extend(self.chunk_document(doc)) + else: + 
documents.append(doc) + + return documents + + # Public methods + + def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]: + entries: List[LLMsTxtEntry] = [] + current_section = "" + overview_lines: List[str] = [] + + for line in content.split("\n"): + section_match = _SECTION_PATTERN.match(line) + if section_match: + current_section = section_match.group(1).strip() + elif not current_section: + overview_lines.append(line) + elif self.skip_optional and current_section.lower() == "optional": + pass + else: + link_match = _LINK_PATTERN.match(line.strip()) + if link_match: + url = link_match.group(2).strip() + if not url.startswith(("http://", "https://")): + url = urljoin(base_url, url) + entries.append( + LLMsTxtEntry( + title=link_match.group(1).strip(), + url=url, + description=(link_match.group(3) or "").strip(), + section=current_section, + ) + ) + + overview = "\n".join(overview_lines).strip() + return overview, entries + + def fetch_url(self, url: str) -> Optional[str]: + try: + response = fetch_with_retry( + url, max_retries=1, proxy=self.proxy, timeout=self.timeout, follow_redirects=True + ) + return self._process_response(response.headers.get("content-type", ""), response.text) + except Exception as e: + log_warning(f"Failed to fetch {url}: {e}") + return None + + async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]: + try: + response = await async_fetch_with_retry( + url, client=client, max_retries=1, timeout=self.timeout, follow_redirects=True + ) + return self._process_response(response.headers.get("content-type", ""), response.text) + except Exception as e: + log_warning(f"Failed to fetch {url}: {e}") + return None + + def read(self, url: str, name: Optional[str] = None) -> List[Document]: + log_debug(f"Reading llms.txt: {url}") + llms_txt_content = self.fetch_url(url) + if not llms_txt_content: + log_error(f"Failed to fetch llms.txt from {url}") + return [] + + overview, entries = 
self.parse_llms_txt(llms_txt_content, url) + log_debug(f"Found {len(entries)} linked URLs in llms.txt") + + entries_to_fetch = entries[: self.max_urls] + if len(entries) > self.max_urls: + log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") + + fetched: Dict[str, str] = {} + for entry in entries_to_fetch: + content = self.fetch_url(entry.url) + if content: + fetched[entry.url] = content + + log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages") + return self._build_documents(overview, entries_to_fetch, fetched, url, name) + + async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]: + log_debug(f"Reading llms.txt asynchronously: {url}") + async with httpx.AsyncClient(proxy=self.proxy) as client: + llms_txt_content = await self.async_fetch_url(client, url) + if not llms_txt_content: + log_error(f"Failed to fetch llms.txt from {url}") + return [] + + overview, entries = self.parse_llms_txt(llms_txt_content, url) + log_debug(f"Found {len(entries)} linked URLs in llms.txt") + + entries_to_fetch = entries[: self.max_urls] + if len(entries) > self.max_urls: + log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})") + + # Entries are fetched concurrently; httpx's default pool allows 100 connections (20 keep-alive), pool-wide rather than per host + async def _fetch_entry(entry: LLMsTxtEntry) -> Tuple[str, Optional[str]]: + content = await self.async_fetch_url(client, entry.url) + return entry.url, content + + results = await asyncio.gather(*[_fetch_entry(e) for e in entries_to_fetch]) + fetched: Dict[str, str] = {entry_url: content for entry_url, content in results if content} + + log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages") + return self._build_documents(overview, entries_to_fetch, fetched, url, name) diff --git a/libs/agno/agno/knowledge/reader/reader_factory.py b/libs/agno/agno/knowledge/reader/reader_factory.py index 92548f4df0..a5aefa3bd0 100644 --- 
a/libs/agno/agno/knowledge/reader/reader_factory.py +++ b/libs/agno/agno/knowledge/reader/reader_factory.py @@ -76,6 +76,10 @@ class ReaderFactory: "name": "WebSearchReader", "description": "Executes web searches and processes results with relevance ranking and content extraction", }, + "llms_txt": { + "name": "LLMsTxtReader", + "description": "Reads llms.txt files, discovers linked documentation URLs, and fetches their content", + }, "docling": { "name": "DoclingReader", "description": "Converts multiple document formats like PDF, DOCX, PPTX, images, HTML, etc. using IBM's Docling library", @@ -279,6 +283,18 @@ def _get_web_search_reader(cls, **kwargs) -> Reader: config.update(kwargs) return WebSearchReader(**config) + @classmethod + def _get_llms_txt_reader(cls, **kwargs) -> Reader: + """Get LLMs Text reader instance.""" + from agno.knowledge.reader.llms_txt_reader import LLMsTxtReader + + config: Dict[str, Any] = { + "name": "LLMs Text Reader", + "description": "Reads llms.txt files, discovers linked documentation URLs, and fetches their content", + } + config.update(kwargs) + return LLMsTxtReader(**config) + @classmethod def _get_docling_reader(cls, **kwargs) -> Reader: """Get Docling reader instance.""" @@ -334,6 +350,7 @@ def get_reader_class(cls, reader_key: str) -> type: "arxiv": ("agno.knowledge.reader.arxiv_reader", "ArxivReader"), "wikipedia": ("agno.knowledge.reader.wikipedia_reader", "WikipediaReader"), "web_search": ("agno.knowledge.reader.web_search_reader", "WebSearchReader"), + "llms_txt": ("agno.knowledge.reader.llms_txt_reader", "LLMsTxtReader"), "docling": ("agno.knowledge.reader.docling_reader", "DoclingReader"), } diff --git a/libs/agno/agno/tools/llms_txt.py b/libs/agno/agno/tools/llms_txt.py new file mode 100644 index 0000000000..cd33a0b0be --- /dev/null +++ b/libs/agno/agno/tools/llms_txt.py @@ -0,0 +1,191 @@ +import json +from typing import Callable, List, Optional + +import httpx + +from agno.knowledge.knowledge import Knowledge +from 
agno.knowledge.reader.llms_txt_reader import LLMsTxtReader +from agno.tools import Toolkit +from agno.utils.log import log_debug, log_info + + +class LLMsTxtTools(Toolkit): + def __init__( + self, + knowledge: Optional[Knowledge] = None, + max_urls: int = 20, + timeout: int = 60, + skip_optional: bool = False, + **kwargs, + ): + self.knowledge: Optional[Knowledge] = knowledge + self.max_urls = max_urls + self.timeout = timeout + self.skip_optional = skip_optional + self.reader = LLMsTxtReader( + max_urls=max_urls, + timeout=timeout, + skip_optional=skip_optional, + ) + + tools: List[Callable] = [] + async_tools_list: List[tuple] = [] + # Agentic mode — agent picks which pages to read + if self.knowledge is None: + tools.append(self.get_llms_txt_index) + tools.append(self.read_llms_txt_url) + async_tools_list.append((self.aget_llms_txt_index, "get_llms_txt_index")) + async_tools_list.append((self.aread_llms_txt_url, "read_llms_txt_url")) + # Knowledge mode — bulk load all pages into vector DB + else: + tools.append(self.read_llms_txt_and_load_knowledge) + async_tools_list.append((self.aread_llms_txt_and_load_knowledge, "read_llms_txt_and_load_knowledge")) + + super().__init__(name="llms_txt_tools", tools=tools, async_tools=async_tools_list, **kwargs) + + # Helpers + + def _async_client(self) -> httpx.AsyncClient: + return httpx.AsyncClient(timeout=self.timeout, proxy=self.reader.proxy) + + def _format_index(self, overview: str, entries: list) -> str: + return json.dumps( + { + "overview": overview, + "pages": [ + {"title": e.title, "url": e.url, "description": e.description, "section": e.section} + for e in entries + ], + "total_pages": len(entries), + } + ) + + # Tools + + def get_llms_txt_index(self, url: str) -> str: + """ + Reads an llms.txt file and returns the index of all available documentation pages. + Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. + + Args: + url (str): The URL of the llms.txt file (e.g. 
https://docs.example.com/llms.txt) + + Returns: + str: JSON with the overview and list of available documentation pages + """ + try: + log_info(f"Reading llms.txt index from {url}") + llms_txt_content = self.reader.fetch_url(url) + if not llms_txt_content: + return f"Failed to fetch llms.txt from {url}" + + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) + return self._format_index(overview, entries) + except Exception as e: + return f"Error reading llms.txt index from {url}: {type(e).__name__}: {e}" + + async def aget_llms_txt_index(self, url: str) -> str: + """ + Reads an llms.txt file and returns the index of all available documentation pages. + Use this to discover what pages are available, then use read_llms_txt_url to fetch specific pages. + + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) + + Returns: + str: JSON with the overview and list of available documentation pages + """ + try: + log_info(f"Reading llms.txt index from {url}") + async with self._async_client() as client: + llms_txt_content = await self.reader.async_fetch_url(client, url) + + if not llms_txt_content: + return f"Failed to fetch llms.txt from {url}" + + overview, entries = self.reader.parse_llms_txt(llms_txt_content, url) + return self._format_index(overview, entries) + except Exception as e: + return f"Error reading llms.txt index from {url}: {type(e).__name__}: {e}" + + def read_llms_txt_url(self, url: str) -> str: + """ + Fetches and returns the content of a specific documentation page. + Use this after calling get_llms_txt_index to read pages relevant to the user's question. 
+ + Args: + url (str): The URL of the documentation page to read + + Returns: + str: The text content of the page + """ + try: + log_debug(f"Fetching URL: {url}") + content = self.reader.fetch_url(url) + if not content: + return f"Failed to fetch content from {url}" + return content + except Exception as e: + return f"Error fetching {url}: {type(e).__name__}: {e}" + + async def aread_llms_txt_url(self, url: str) -> str: + """ + Fetches and returns the content of a specific documentation page. + Use this after calling get_llms_txt_index to read pages relevant to the user's question. + + Args: + url (str): The URL of the documentation page to read + + Returns: + str: The text content of the page + """ + try: + log_debug(f"Fetching URL: {url}") + async with self._async_client() as client: + content = await self.reader.async_fetch_url(client, url) + + if not content: + return f"Failed to fetch content from {url}" + return content + except Exception as e: + return f"Error fetching {url}: {type(e).__name__}: {e}" + + def read_llms_txt_and_load_knowledge(self, url: str) -> str: + """ + Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. + + Args: + url (str): The URL of the llms.txt file (e.g. https://docs.example.com/llms.txt) + + Returns: + str: Summary of what was loaded into the knowledge base + """ + if self.knowledge is None: + return "Knowledge base not provided" + + try: + log_info(f"Reading llms.txt from {url}") + self.knowledge.insert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" + except Exception as e: + return f"Error loading knowledge from {url}: {type(e).__name__}: {e}" + + async def aread_llms_txt_and_load_knowledge(self, url: str) -> str: + """ + Reads an llms.txt file, fetches all linked pages, and loads them into the knowledge base. + + Args: + url (str): The URL of the llms.txt file (e.g. 
https://docs.example.com/llms.txt) + + Returns: + str: Summary of what was loaded into the knowledge base + """ + if self.knowledge is None: + return "Knowledge base not provided" + + try: + log_info(f"Reading llms.txt from {url}") + await self.knowledge.ainsert(url=url, reader=self.reader) + return f"Successfully loaded documentation from {url} into the knowledge base" + except Exception as e: + return f"Error loading knowledge from {url}: {type(e).__name__}: {e}" diff --git a/libs/agno/agno/utils/http.py b/libs/agno/agno/utils/http.py index ca887b3e83..5f2cbe59ba 100644 --- a/libs/agno/agno/utils/http.py +++ b/libs/agno/agno/utils/http.py @@ -179,12 +179,19 @@ def fetch_with_retry( max_retries: int = DEFAULT_MAX_RETRIES, backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: Optional[str] = None, + timeout: Optional[int] = None, + follow_redirects: Optional[bool] = None, ) -> httpx.Response: """Synchronous HTTP GET with retry logic.""" for attempt in range(max_retries): try: - response = httpx.get(url, proxy=proxy) if proxy else httpx.get(url) + kwargs: dict = {"proxy": proxy} + if timeout is not None: + kwargs["timeout"] = timeout + if follow_redirects is not None: + kwargs["follow_redirects"] = follow_redirects + response = httpx.get(url, **kwargs) response.raise_for_status() return response except httpx.RequestError as e: @@ -198,7 +205,7 @@ def fetch_with_retry( logger.exception(f"HTTP error for {url}: {e.response.status_code} - {e.response.text}") raise - raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") + raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") # type: ignore[call-arg] async def async_fetch_with_retry( @@ -207,16 +214,23 @@ async def async_fetch_with_retry( max_retries: int = DEFAULT_MAX_RETRIES, backoff_factor: int = DEFAULT_BACKOFF_FACTOR, proxy: Optional[str] = None, + timeout: Optional[int] = None, + follow_redirects: Optional[bool] = None, ) -> httpx.Response: """Asynchronous HTTP GET 
with retry logic.""" async def _fetch(): + kwargs: dict = {} + if timeout is not None: + kwargs["timeout"] = timeout + if follow_redirects is not None: + kwargs["follow_redirects"] = follow_redirects + if client is None: - client_args = {"proxy": proxy} if proxy else {} - async with httpx.AsyncClient(**client_args) as local_client: # type: ignore - return await local_client.get(url) + async with httpx.AsyncClient(proxy=proxy) as local_client: + return await local_client.get(url, **kwargs) else: - return await client.get(url) + return await client.get(url, **kwargs) for attempt in range(max_retries): try: @@ -234,4 +248,4 @@ async def _fetch(): logger.exception(f"HTTP error for {url}: {e.response.status_code} - {e.response.text}") raise - raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") + raise httpx.RequestError(f"Failed to fetch {url} after {max_retries} attempts") # type: ignore[call-arg] diff --git a/libs/agno/tests/unit/os/routers/test_sort_order_default.py b/libs/agno/tests/unit/os/routers/test_sort_order_default.py index 856d625ad2..1843e6f148 100644 --- a/libs/agno/tests/unit/os/routers/test_sort_order_default.py +++ b/libs/agno/tests/unit/os/routers/test_sort_order_default.py @@ -15,7 +15,6 @@ from agno.os.schema import SortOrder - # --------------------------------------------------------------------------- # Helpers – create mock DB / Knowledge with only the methods each router needs # --------------------------------------------------------------------------- diff --git a/libs/agno/tests/unit/tools/test_llms_txt.py b/libs/agno/tests/unit/tools/test_llms_txt.py new file mode 100644 index 0000000000..f4e3c47b0f --- /dev/null +++ b/libs/agno/tests/unit/tools/test_llms_txt.py @@ -0,0 +1,542 @@ +"""Unit tests for LLMsTxtTools and LLMsTxtReader.""" + +import json +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import httpx +import pytest + +bs4 = pytest.importorskip("bs4") + +from 
agno.knowledge.reader.llms_txt_reader import LLMsTxtEntry, LLMsTxtReader # noqa: E402 +from agno.tools.llms_txt import LLMsTxtTools # noqa: E402 + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +SAMPLE_LLMS_TXT = """# Acme Project + +> Acme is a framework for building AI applications. + +Acme makes it easy to build production-ready AI agents. + +## Getting Started + +- [Introduction](https://docs.acme.com/introduction): Overview of Acme +- [Installation](https://docs.acme.com/installation): How to install Acme +- [Quickstart](https://docs.acme.com/quickstart): Build your first agent + +## API Reference + +- [Agent API](https://docs.acme.com/api/agent): Agent class reference +- [Tools API](https://docs.acme.com/api/tools): Tools class reference + +## Optional + +- [Changelog](https://docs.acme.com/changelog): Release notes +- [Contributing](https://docs.acme.com/contributing): How to contribute +""" + +SAMPLE_LLMS_TXT_RELATIVE = """# My Project + +> A project with relative links. 
+ +## Docs + +- [Guide](/docs/guide): The guide +- [API](api/reference): API docs +""" + + +@pytest.fixture +def reader(): + return LLMsTxtReader(chunk=False) + + +@pytest.fixture +def tools(): + return LLMsTxtTools() + + +@pytest.fixture +def tools_with_knowledge(): + mock_knowledge = MagicMock() + return LLMsTxtTools(knowledge=mock_knowledge) + + +def _mock_httpx_response(text: str, content_type: str = "text/plain") -> Mock: + resp = Mock() + resp.headers = {"content-type": content_type} + resp.text = text + resp.raise_for_status = Mock() + return resp + + +# ============================================================================ +# READER: INIT +# ============================================================================ + + +def test_reader_defaults(): + reader = LLMsTxtReader() + assert reader.max_urls == 20 + assert reader.timeout == 60 + assert reader.proxy is None + assert reader.skip_optional is False + + +def test_reader_custom_params(): + reader = LLMsTxtReader(max_urls=50, timeout=10, skip_optional=True) + assert reader.max_urls == 50 + assert reader.timeout == 10 + assert reader.skip_optional is True + + +# ============================================================================ +# READER: PARSE +# ============================================================================ + + +def test_parse_entries(reader): + overview, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert len(entries) == 7 + assert entries[0].title == "Introduction" + assert entries[0].url == "https://docs.acme.com/introduction" + assert entries[0].description == "Overview of Acme" + assert entries[0].section == "Getting Started" + + +def test_parse_overview(reader): + overview, _ = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert "# Acme Project" in overview + assert "Acme makes it easy" in overview + + +def test_parse_sections(reader): + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, 
"https://docs.acme.com/llms.txt") + + sections = {e.section for e in entries} + assert sections == {"Getting Started", "API Reference", "Optional"} + + +def test_parse_skip_optional(): + reader = LLMsTxtReader(skip_optional=True) + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT, "https://docs.acme.com/llms.txt") + + assert len(entries) == 5 + assert all(e.section != "Optional" for e in entries) + + +def test_parse_relative_urls(reader): + _, entries = reader.parse_llms_txt(SAMPLE_LLMS_TXT_RELATIVE, "https://example.com/llms.txt") + + assert entries[0].url == "https://example.com/docs/guide" + assert entries[1].url == "https://example.com/api/reference" + + +def test_parse_empty_content(reader): + overview, entries = reader.parse_llms_txt("", "https://example.com/llms.txt") + + assert overview == "" + assert entries == [] + + +def test_parse_no_links(reader): + content = "# Title\n\nSome overview text.\n\n## Section\n\nNo links here." + overview, entries = reader.parse_llms_txt(content, "https://example.com/llms.txt") + + assert "# Title" in overview + assert entries == [] + + +# ============================================================================ +# READER: PROCESS RESPONSE +# ============================================================================ + + +def test_process_response_plain_text(reader): + result = reader._process_response("text/plain", "Plain text content") + assert result == "Plain text content" + + +def test_process_response_markdown(reader): + result = reader._process_response("text/markdown", "# Heading\n\nBody") + assert result == "# Heading\n\nBody" + + +def test_process_response_html_extracts_main(reader): + html = "
Main content here
" + result = reader._process_response("text/html", html) + assert "Main content here" in result + assert "Nav" not in result + + +def test_process_response_html_body_fallback(reader): + html = "
Body content
" + result = reader._process_response("text/html", html) + assert "Body content" in result + + +def test_process_response_strips_scripts(reader): + html = "

Text

" + result = reader._process_response("text/html", html) + assert "var x" not in result + assert "Text" in result + + +def test_process_response_newline_separator(reader): + html = "

First paragraph

Second paragraph

" + result = reader._process_response("text/html", html) + assert "First paragraph" in result + assert "Second paragraph" in result + assert "\n" in result + + +def test_process_response_html_sniffing(reader): + """HTML detected by content prefix when content-type header is missing.""" + result = reader._process_response("", "

Sniffed

") + assert "Sniffed" in result + + +def test_process_response_unknown_content_type(reader): + """Unknown content-type returns raw text.""" + result = reader._process_response("application/json", '{"key": "value"}') + assert result == '{"key": "value"}' + + +# ============================================================================ +# READER: FETCH +# ============================================================================ + + +def test_fetch_url_plain_content(reader): + mock_response = _mock_httpx_response("Plain text content", "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = reader.fetch_url("https://example.com/file.txt") + + assert result == "Plain text content" + + +def test_fetch_url_html_content(reader): + mock_response = _mock_httpx_response("
Extracted
", "text/html") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = reader.fetch_url("https://example.com/page") + + assert "Extracted" in result + + +def test_fetch_url_http_error(reader): + with patch( + "agno.utils.http.httpx.get", + side_effect=httpx.HTTPStatusError("error", request=MagicMock(), response=MagicMock(status_code=404)), + ): + result = reader.fetch_url("https://example.com/missing") + + assert result is None + + +def test_fetch_url_request_error(reader): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = reader.fetch_url("https://example.com/down") + + assert result is None + + +# ============================================================================ +# READER: BUILD DOCUMENTS +# ============================================================================ + + +def test_build_documents_overview_and_linked(reader): + entries = [ + LLMsTxtEntry(title="Intro", url="https://example.com/intro", description="Intro page", section="Docs"), + ] + fetched = {"https://example.com/intro": "Introduction content here"} + + docs = reader._build_documents("Overview text", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 2 + assert docs[0].meta_data["type"] == "llms_txt_overview" + assert docs[0].content == "Overview text" + assert docs[1].meta_data["type"] == "llms_txt_linked_doc" + assert docs[1].name == "Intro" + assert docs[1].content == "Introduction content here" + + +def test_build_documents_skips_unfetched(reader): + entries = [ + LLMsTxtEntry(title="Missing", url="https://example.com/missing", description="", section="Docs"), + ] + docs = reader._build_documents("Overview", entries, {}, "https://example.com/llms.txt", None) + + assert len(docs) == 1 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +def test_build_documents_empty_overview(reader): + entries = [ + LLMsTxtEntry(title="Page", url="https://example.com/page", 
description="", section="Docs"), + ] + fetched = {"https://example.com/page": "Page content"} + + docs = reader._build_documents("", entries, fetched, "https://example.com/llms.txt", None) + + assert len(docs) == 1 + assert docs[0].meta_data["type"] == "llms_txt_linked_doc" + + +# ============================================================================ +# READER: READ +# ============================================================================ + + +def test_read_fetches_and_builds(): + reader = LLMsTxtReader(max_urls=5, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + assert len(docs) == 6 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +def test_read_returns_empty_on_failure(): + reader = LLMsTxtReader() + + with patch.object(reader, "fetch_url", return_value=None): + docs = reader.read("https://example.com/llms.txt") + + assert docs == [] + + +def test_read_max_urls_limits(): + reader = LLMsTxtReader(max_urls=2, chunk=False) + + def mock_fetch(url): + if url == "https://example.com/llms.txt": + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "fetch_url", side_effect=mock_fetch): + docs = reader.read("https://example.com/llms.txt") + + assert len(docs) == 3 + + +# ============================================================================ +# READER: ASYNC READ +# ============================================================================ + + +@pytest.mark.asyncio +async def test_async_read_fetches_concurrently(): + reader = LLMsTxtReader(max_urls=3, chunk=False) + + async def mock_async_fetch(client, url): + if "llms.txt" in url: + return SAMPLE_LLMS_TXT + return f"Content of {url}" + + with patch.object(reader, "async_fetch_url", side_effect=mock_async_fetch): + docs = await 
reader.async_read("https://example.com/llms.txt") + + assert len(docs) == 4 + assert docs[0].meta_data["type"] == "llms_txt_overview" + + +@pytest.mark.asyncio +async def test_async_read_returns_empty_on_failure(): + reader = LLMsTxtReader() + + async def mock_async_fetch(client, url): + return None + + with patch.object(reader, "async_fetch_url", side_effect=mock_async_fetch): + docs = await reader.async_read("https://example.com/llms.txt") + + assert docs == [] + + +# ============================================================================ +# TOOLKIT: INIT +# ============================================================================ + + +def test_toolkit_agentic_tools(tools): + func_names = [func.name for func in tools.functions.values()] + assert "get_llms_txt_index" in func_names + assert "read_llms_txt_url" in func_names + assert "read_llms_txt_and_load_knowledge" not in func_names + + +def test_toolkit_async_tools(tools): + async_func_names = [func.name for func in tools.async_functions.values()] + assert "get_llms_txt_index" in async_func_names + assert "read_llms_txt_url" in async_func_names + + +def test_toolkit_knowledge_tools(tools_with_knowledge): + func_names = [func.name for func in tools_with_knowledge.functions.values()] + assert "read_llms_txt_and_load_knowledge" in func_names + assert "get_llms_txt_index" not in func_names + + +def test_toolkit_knowledge_async_tools(tools_with_knowledge): + async_func_names = [func.name for func in tools_with_knowledge.async_functions.values()] + assert "read_llms_txt_and_load_knowledge" in async_func_names + + +def test_toolkit_custom_params(): + t = LLMsTxtTools(max_urls=50, timeout=10, skip_optional=True) + assert t.max_urls == 50 + assert t.timeout == 10 + assert t.skip_optional is True + + +def test_toolkit_reader_reuse(tools): + assert tools.reader is not None + assert tools.reader.timeout == tools.timeout + assert tools.reader.max_urls == tools.max_urls + + +# 
============================================================================ +# TOOLKIT: GET INDEX +# ============================================================================ + + +def test_get_index_returns_json(tools): + mock_response = _mock_httpx_response(SAMPLE_LLMS_TXT, "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = tools.get_llms_txt_index("https://docs.acme.com/llms.txt") + + data = json.loads(result) + assert data["total_pages"] == 7 + assert data["pages"][0]["title"] == "Introduction" + assert data["pages"][0]["url"] == "https://docs.acme.com/introduction" + assert "overview" in data + + +def test_get_index_failure(tools): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.get_llms_txt_index("https://example.com/llms.txt") + + assert "Failed to fetch" in result + + +def test_get_index_error_handling(tools): + with patch.object(tools.reader, "fetch_url", side_effect=RuntimeError("unexpected")): + result = tools.get_llms_txt_index("https://example.com/llms.txt") + + assert "Error" in result + assert "RuntimeError" in result + + +# ============================================================================ +# TOOLKIT: READ URL +# ============================================================================ + + +def test_read_url_returns_content(tools): + mock_response = _mock_httpx_response("Page content here", "text/plain") + + with patch("agno.utils.http.httpx.get", return_value=mock_response): + result = tools.read_llms_txt_url("https://docs.acme.com/introduction") + + assert result == "Page content here" + + +def test_read_url_failure(tools): + with patch("agno.utils.http.httpx.get", side_effect=httpx.RequestError("connection failed")): + result = tools.read_llms_txt_url("https://example.com/missing") + + assert "Failed to fetch" in result + + +# ============================================================================ +# TOOLKIT: ASYNC 
TOOLS +# ============================================================================ + + +@pytest.mark.asyncio +async def test_aget_index_returns_json(tools): + mock_response = _mock_httpx_response(SAMPLE_LLMS_TXT, "text/plain") + + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + + with patch("agno.tools.llms_txt.httpx.AsyncClient") as mock_async_client: + mock_async_client.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_async_client.return_value.__aexit__ = AsyncMock(return_value=False) + + result = await tools.aget_llms_txt_index("https://docs.acme.com/llms.txt") + + data = json.loads(result) + assert data["total_pages"] == 7 + assert data["pages"][0]["title"] == "Introduction" + + +@pytest.mark.asyncio +async def test_aread_url_returns_content(tools): + mock_response = _mock_httpx_response("Async page content", "text/plain") + + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + + with patch("agno.tools.llms_txt.httpx.AsyncClient") as mock_async_client: + mock_async_client.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_async_client.return_value.__aexit__ = AsyncMock(return_value=False) + + result = await tools.aread_llms_txt_url("https://docs.acme.com/page") + + assert result == "Async page content" + + +@pytest.mark.asyncio +async def test_aread_knowledge_delegates(tools_with_knowledge): + tools_with_knowledge.knowledge.ainsert = AsyncMock() + + result = await tools_with_knowledge.aread_llms_txt_and_load_knowledge("https://example.com/llms.txt") + + tools_with_knowledge.knowledge.ainsert.assert_called_once_with( + url="https://example.com/llms.txt", reader=tools_with_knowledge.reader + ) + assert "Successfully loaded" in result + + +# ============================================================================ +# TOOLKIT: KNOWLEDGE +# ============================================================================ + + +def 
test_knowledge_delegates_to_insert(tools_with_knowledge): + result = tools_with_knowledge.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + + tools_with_knowledge.knowledge.insert.assert_called_once_with( + url="https://example.com/llms.txt", reader=tools_with_knowledge.reader + ) + assert "Successfully loaded" in result + + +def test_knowledge_no_knowledge(tools): + result = tools.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + assert result == "Knowledge base not provided" + + +def test_knowledge_error_handling(tools_with_knowledge): + tools_with_knowledge.knowledge.insert.side_effect = RuntimeError("db connection failed") + + result = tools_with_knowledge.read_llms_txt_and_load_knowledge("https://example.com/llms.txt") + + assert "Error" in result + assert "RuntimeError" in result