Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fa6efc4
feat: add LLMsTxtReader and LLMsTxtTools for llms.txt support
ashpreetbedi Apr 10, 2026
f59dfe5
Merge branch 'main' into feat/llms-txt-reader-tools
kausmeows Apr 10, 2026
13884e7
Merge branch 'main' into feat/llms-txt-reader-tools
kausmeows Apr 10, 2026
f2a9101
fix: address review issues in LLMsTxtReader and LLMsTxtTools (#7465)
ysolanky Apr 10, 2026
c9d6789
Merge branch 'main' into feat/llms-txt-reader-tools
Mustafa-Esoofally Apr 10, 2026
482cd73
fix: improve LLMsTxtTools async patterns and deduplicate reader logic
Mustafa-Esoofally Apr 10, 2026
f35ab19
fix: delegate knowledge loading to Knowledge.insert() pipeline
Mustafa-Esoofally Apr 10, 2026
8bb61d7
fix: simplify reader, delegate to Knowledge pipeline, remove dead code
Mustafa-Esoofally Apr 10, 2026
7d88c44
fix: remove include_llms_txt_content parameter — always include overview
Mustafa-Esoofally Apr 10, 2026
4474d95
fix: clean up reader — remove init docstring, simplify parser
Mustafa-Esoofally Apr 10, 2026
9039720
fix: inline _extract_content into _process_response
Mustafa-Esoofally Apr 10, 2026
d6becc8
fix: simplify fetch_url — collapse 3 except blocks into 1
Mustafa-Esoofally Apr 10, 2026
5ed981b
fix: remove module-level constant, inline semaphore with WHY comment
Mustafa-Esoofally Apr 10, 2026
a5fe2a3
fix: reuse fetch_with_retry utils instead of raw httpx calls
Mustafa-Esoofally Apr 10, 2026
62e75c0
fix: change defaults — max_urls=20, timeout=60
Mustafa-Esoofally Apr 10, 2026
1d8312f
fix: move imports to module level — bs4 and LLMsTxtReader
Mustafa-Esoofally Apr 10, 2026
2ae73c2
fix: remove class docstring and WHAT comment
Mustafa-Esoofally Apr 10, 2026
8bdd061
fix: clean up toolkit — trim docstrings, simplify helpers, add sections
Mustafa-Esoofally Apr 10, 2026
a252b2f
fix: match Gmail toolkit docstring pattern — Args/Returns style
Mustafa-Esoofally Apr 10, 2026
bc918b0
fix: add try/except to all tools, reorder methods — helpers then public
Mustafa-Esoofally Apr 10, 2026
beea0b0
fix: replace Any with proper types
Mustafa-Esoofally Apr 10, 2026
7ebc5a2
test: rewrite tests following Perplexity/Gmail pattern — 46 tests
Mustafa-Esoofally Apr 10, 2026
4908f39
fix: use ContentType.URL to decide pre-download skip in Knowledge
Mustafa-Esoofally Apr 10, 2026
b075bae
fix: preserve httpx defaults when timeout/follow_redirects not specified
Mustafa-Esoofally Apr 10, 2026
4b6fb1d
fix: use Optional for new http util params, fix import order
Mustafa-Esoofally Apr 10, 2026
32b0aac
wip: checkpoint LLMs.txt local review-round history + stray test file
Mustafa-Esoofally Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions cookbook/91_tools/llms_txt_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
LLMs.txt Tools - Agentic Documentation Discovery
=============================

Demonstrates how to use LLMsTxtTools in agentic mode where the agent:
1. Reads the llms.txt index to discover available documentation pages
2. Decides which pages are relevant to the user's question
3. Fetches only the specific pages it needs

The llms.txt format (https://llmstxt.org) is a standardized way for websites
to provide LLM-friendly documentation indexes.
"""

from agno.agent import Agent
from agno.models.openai import OpenAIResponses
from agno.tools.llms_txt import LLMsTxtTools

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------

agent = Agent(
model=OpenAIResponses(id="gpt-5.4"),
tools=[LLMsTxtTools()],
instructions=[
"You can read llms.txt files to discover documentation for any project.",
"First use get_llms_txt_index to see what pages are available.",
"Then use read_llms_txt_url to fetch only the pages relevant to the user's question.",
],
markdown=True,
)

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
agent.print_response(
"Using the llms.txt at https://docs.agno.com/llms.txt, "
"find and read the documentation about how to create an agent with tools",
markdown=True,
stream=True,
)
56 changes: 56 additions & 0 deletions cookbook/91_tools/llms_txt_tools_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
LLMs.txt Tools with Knowledge Base
=============================

Demonstrates loading all documentation from an llms.txt file into a knowledge base
for retrieval-augmented generation (RAG).

The agent reads the llms.txt index, fetches all linked documentation pages,
and stores them in a PgVector knowledge base for semantic search.
"""

from agno.agent import Agent
from agno.knowledge.knowledge import Knowledge
from agno.models.openai import OpenAIResponses
from agno.tools.llms_txt import LLMsTxtTools
from agno.vectordb.pgvector import PgVector

# ---------------------------------------------------------------------------
# Setup Knowledge Base
# ---------------------------------------------------------------------------

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge = Knowledge(
vector_db=PgVector(
table_name="llms_txt_docs",
db_url=db_url,
),
)

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------

agent = Agent(
model=OpenAIResponses(id="gpt-5.4"),
knowledge=knowledge,
search_knowledge=True,
tools=[LLMsTxtTools(knowledge=knowledge, max_urls=20)],
instructions=[
"You can load documentation from llms.txt files into your knowledge base.",
"When asked about a project, first load its llms.txt into the knowledge base, then answer questions.",
],
markdown=True,
)

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
agent.print_response(
"Load the documentation from https://docs.agno.com/llms.txt into the knowledge base, "
"then tell me how to create an agent with Agno",
markdown=True,
stream=True,
)
19 changes: 17 additions & 2 deletions libs/agno/agno/knowledge/knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
RemoteContent,
)
from agno.knowledge.remote_knowledge import RemoteKnowledge
from agno.knowledge.types import ContentType
from agno.knowledge.utils import merge_user_metadata, set_agno_metadata, strip_agno_metadata
from agno.utils.http import async_fetch_with_retry
from agno.utils.log import log_debug, log_error, log_info, log_warning
Expand Down Expand Up @@ -1564,7 +1565,14 @@ async def _aload_from_url(
file_extension = url_path.suffix.lower()

bytes_content = None
if file_extension:
# Skip pre-download when a custom URL-based reader is provided —
# it handles the URL directly (e.g. LLMsTxtReader fetches linked pages)
skip_download = (
content.reader is not None
and hasattr(content.reader, "get_supported_content_types")
and ContentType.URL in content.reader.get_supported_content_types()
)
if file_extension and not skip_download:
async with AsyncClient() as client:
response = await async_fetch_with_retry(content.url, client=client)
bytes_content = BytesIO(response.content)
Expand Down Expand Up @@ -1716,7 +1724,14 @@ def _load_from_url(
file_extension = url_path.suffix.lower()

bytes_content = None
if file_extension:
# Skip pre-download when a custom URL-based reader is provided —
# it handles the URL directly (e.g. LLMsTxtReader fetches linked pages)
skip_download = (
content.reader is not None
and hasattr(content.reader, "get_supported_content_types")
and ContentType.URL in content.reader.get_supported_content_types()
)
if file_extension and not skip_download:
response = fetch_with_retry(content.url)
bytes_content = BytesIO(response.content)

Expand Down
243 changes: 243 additions & 0 deletions libs/agno/agno/knowledge/reader/llms_txt_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
import asyncio
import re
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import httpx

try:
from bs4 import BeautifulSoup # noqa: F401
except ImportError:
raise ImportError("The `bs4` package is not installed. Please install it via `pip install beautifulsoup4`.")

from agno.knowledge.chunking.fixed import FixedSizeChunking
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
from agno.knowledge.document.base import Document
from agno.knowledge.reader.base import Reader
from agno.knowledge.types import ContentType
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
from agno.utils.log import log_debug, log_error, log_warning

# Matches markdown list links: "- [title](url)" with an optional ": description" suffix.
# Groups: 1 = title, 2 = url, 3 = description (may be None).
_LINK_PATTERN = re.compile(r"-\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.+))?")
# Matches H2 headers ("## Section") that group links in an llms.txt file.
_SECTION_PATTERN = re.compile(r"^##\s+(.+)$", re.MULTILINE)


@dataclass
class LLMsTxtEntry:
    """A single linked document parsed from an llms.txt index."""

    # Link text of the markdown entry, e.g. "Quickstart".
    title: str
    # URL of the linked page (relative links are resolved against the llms.txt URL).
    url: str
    # Text after the link ("- [t](u): description"); empty string when absent.
    description: str
    # The "## ..." section the link appeared under; empty for links before any section.
    section: str


class LLMsTxtReader(Reader):
    """Reader for llms.txt files (see https://llmstxt.org).

    Fetches an llms.txt index, parses its overview text and markdown links,
    downloads up to ``max_urls`` linked pages, and returns everything as
    (optionally chunked) Document objects.

    Example:
        reader = LLMsTxtReader(max_urls=20)
        documents = reader.read("https://docs.example.com/llms.txt")
    """

    def __init__(
        self,
        chunking_strategy: Optional[ChunkingStrategy] = None,
        max_urls: int = 20,
        timeout: int = 60,
        proxy: Optional[str] = None,
        skip_optional: bool = False,
        **kwargs,
    ):
        """
        Args:
            chunking_strategy: Strategy used to split documents. Defaults to
                FixedSizeChunking with ``chunk_size`` taken from kwargs (5000).
            max_urls: Maximum number of linked pages to fetch.
            timeout: Per-request timeout in seconds.
            proxy: Optional proxy URL used for all HTTP requests.
            skip_optional: If True, links under the "## Optional" section are skipped.
        """
        if chunking_strategy is None:
            chunk_size = kwargs.get("chunk_size", 5000)
            chunking_strategy = FixedSizeChunking(chunk_size=chunk_size)
        super().__init__(chunking_strategy=chunking_strategy, **kwargs)
        self.max_urls = max_urls
        self.timeout = timeout
        self.proxy = proxy
        self.skip_optional = skip_optional

    @classmethod
    def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]:
        return [
            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
            ChunkingStrategyType.AGENTIC_CHUNKER,
            ChunkingStrategyType.DOCUMENT_CHUNKER,
            ChunkingStrategyType.RECURSIVE_CHUNKER,
            ChunkingStrategyType.SEMANTIC_CHUNKER,
        ]

    @classmethod
    def get_supported_content_types(cls) -> List[ContentType]:
        return [ContentType.URL]

    # Helpers

    def _process_response(self, content_type: str, text: str) -> str:
        """Extract readable text from a response body.

        Plain text and markdown pass through unchanged; HTML is stripped of
        boilerplate tags and reduced to its main content.
        """
        if any(t in content_type for t in ["text/plain", "text/markdown"]):
            return text

        # Fall back to sniffing the body when the content-type header is missing/wrong.
        if "text/html" in content_type or text.strip().startswith(("<!DOCTYPE", "<html", "<HTML")):
            soup = BeautifulSoup(text, "html.parser")
            for tag in soup.find_all(["script", "style", "nav", "header", "footer", "aside"]):
                tag.decompose()

            # Prefer the semantic main-content container when present.
            main = soup.find("main") or soup.find("article") or soup.find(attrs={"role": "main"})
            if main:
                return main.get_text(separator="\n", strip=True)

            body = soup.find("body")
            if body:
                return body.get_text(separator="\n", strip=True)

            return soup.get_text(separator="\n", strip=True)

        return text

    def _build_documents(
        self,
        overview: str,
        entries: List[LLMsTxtEntry],
        fetched: Dict[str, str],
        llms_txt_url: str,
        name: Optional[str],
    ) -> List[Document]:
        """Build (and optionally chunk) Documents from the overview and fetched pages.

        Entries whose URL is missing from ``fetched`` (failed downloads) are skipped.
        """
        documents: List[Document] = []

        if overview:
            doc = Document(
                name=name or llms_txt_url,
                id=str(uuid.uuid4()),
                meta_data={"url": llms_txt_url, "type": "llms_txt_overview"},
                content=overview,
            )
            if self.chunk:
                documents.extend(self.chunk_document(doc))
            else:
                documents.append(doc)

        for entry in entries:
            content = fetched.get(entry.url)
            if not content:
                continue

            doc = Document(
                name=entry.title,
                id=str(uuid.uuid4()),
                meta_data={
                    "url": entry.url,
                    "section": entry.section,
                    "description": entry.description,
                    "type": "llms_txt_linked_doc",
                },
                content=content,
            )
            if self.chunk:
                documents.extend(self.chunk_document(doc))
            else:
                documents.append(doc)

        return documents

    # Public methods

    def parse_llms_txt(self, content: str, base_url: str) -> Tuple[str, List[LLMsTxtEntry]]:
        """Parse llms.txt content into an overview string and a list of link entries.

        Args:
            content: Raw llms.txt text.
            base_url: URL of the llms.txt file, used to resolve relative links.

        Returns:
            Tuple of (overview text before the first "##" section, parsed entries).
        """
        entries: List[LLMsTxtEntry] = []
        current_section = ""
        overview_lines: List[str] = []

        for line in content.split("\n"):
            section_match = _SECTION_PATTERN.match(line)
            if section_match:
                current_section = section_match.group(1).strip()
            elif not current_section:
                # Everything before the first section header is the overview.
                overview_lines.append(line)
            elif self.skip_optional and current_section.lower() == "optional":
                pass
            else:
                link_match = _LINK_PATTERN.match(line.strip())
                if link_match:
                    url = link_match.group(2).strip()
                    if not url.startswith(("http://", "https://")):
                        url = urljoin(base_url, url)
                    entries.append(
                        LLMsTxtEntry(
                            title=link_match.group(1).strip(),
                            url=url,
                            description=(link_match.group(3) or "").strip(),
                            section=current_section,
                        )
                    )

        overview = "\n".join(overview_lines).strip()
        return overview, entries

    def fetch_url(self, url: str) -> Optional[str]:
        """Fetch a URL synchronously; returns extracted text or None on failure."""
        try:
            response = fetch_with_retry(
                url, max_retries=1, proxy=self.proxy, timeout=self.timeout, follow_redirects=True
            )
            return self._process_response(response.headers.get("content-type", ""), response.text)
        except Exception as e:
            # Best-effort: a single unreachable page should not abort the whole read.
            log_warning(f"Failed to fetch {url}: {e}")
            return None

    async def async_fetch_url(self, client: httpx.AsyncClient, url: str) -> Optional[str]:
        """Fetch a URL asynchronously; returns extracted text or None on failure."""
        try:
            response = await async_fetch_with_retry(
                url, client=client, max_retries=1, timeout=self.timeout, follow_redirects=True
            )
            return self._process_response(response.headers.get("content-type", ""), response.text)
        except Exception as e:
            log_warning(f"Failed to fetch {url}: {e}")
            return None

    def read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """Read an llms.txt file and all linked pages synchronously.

        Args:
            url: URL of the llms.txt file.
            name: Optional name for the overview document (defaults to the URL).

        Returns:
            List of Documents; empty if the llms.txt itself cannot be fetched.
        """
        log_debug(f"Reading llms.txt: {url}")
        llms_txt_content = self.fetch_url(url)
        if not llms_txt_content:
            log_error(f"Failed to fetch llms.txt from {url}")
            return []

        overview, entries = self.parse_llms_txt(llms_txt_content, url)
        log_debug(f"Found {len(entries)} linked URLs in llms.txt")

        entries_to_fetch = entries[: self.max_urls]
        if len(entries) > self.max_urls:
            log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})")

        fetched: Dict[str, str] = {}
        for entry in entries_to_fetch:
            # llms.txt files commonly link the same page from several sections —
            # don't re-download a URL we already fetched successfully.
            if entry.url in fetched:
                continue
            content = self.fetch_url(entry.url)
            if content:
                fetched[entry.url] = content

        log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages")
        return self._build_documents(overview, entries_to_fetch, fetched, url, name)

    async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """Read an llms.txt file and all linked pages concurrently.

        Args:
            url: URL of the llms.txt file.
            name: Optional name for the overview document (defaults to the URL).

        Returns:
            List of Documents; empty if the llms.txt itself cannot be fetched.
        """
        log_debug(f"Reading llms.txt asynchronously: {url}")
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            llms_txt_content = await self.async_fetch_url(client, url)
            if not llms_txt_content:
                log_error(f"Failed to fetch llms.txt from {url}")
                return []

            overview, entries = self.parse_llms_txt(llms_txt_content, url)
            log_debug(f"Found {len(entries)} linked URLs in llms.txt")

            entries_to_fetch = entries[: self.max_urls]
            if len(entries) > self.max_urls:
                log_warning(f"Limiting to {self.max_urls} URLs (found {len(entries)})")

            # Deduplicate URLs (dict.fromkeys preserves first-seen order) so each
            # page is downloaded at most once. Concurrency is bounded by max_urls
            # and by the shared client's connection-pool limits (httpx defaults:
            # max_connections=100 pool-wide).
            unique_urls = list(dict.fromkeys(entry.url for entry in entries_to_fetch))

            async def _fetch_one(target_url: str) -> Tuple[str, Optional[str]]:
                return target_url, await self.async_fetch_url(client, target_url)

            results = await asyncio.gather(*[_fetch_one(u) for u in unique_urls])
            fetched: Dict[str, str] = {fetched_url: content for fetched_url, content in results if content}

            log_debug(f"Successfully fetched {len(fetched)}/{len(entries_to_fetch)} linked pages")
            return self._build_documents(overview, entries_to_fetch, fetched, url, name)
Loading
Loading