-
Notifications
You must be signed in to change notification settings - Fork 317
Expand file tree
/
Copy path22_agents_judge_critic.py
More file actions
102 lines (79 loc) · 3.29 KB
/
22_agents_judge_critic.py
File metadata and controls
102 lines (79 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
This example shows the LLM as a judge pattern. The first agent generates a stock summary
from the research notes and the second agent evaluates the summary. The first agent is asked
to continually improve the summary until the evaluator gives a pass.
`3_research_notes.txt` is the text file generated by our previous section where our multi-agent
orchestration pattern is demonstrated.
Usage:
python 22_agents_judge_critic.py
🤖: What company are you interested in?
👧: bbca
"""
from dotenv import load_dotenv
import asyncio
from dataclasses import dataclass
from typing import Literal
from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace, function_tool
load_dotenv()  # load API credentials (e.g. OPENAI_API_KEY) from a local .env file
@function_tool
def read_company_data_from_txt() -> str:
    """
    Read company research notes from the text file ``3_research_notes.txt``.

    Returns:
        The full file contents on success, or a human-readable error
        message if the file is missing or cannot be read. Errors are
        returned as strings (not raised) so the agent can react to them.
    """
    try:
        # Explicit encoding avoids platform-dependent default decoding.
        with open("3_research_notes.txt", "r", encoding="utf-8") as file:
            data = file.read()
        print(data)  # echo the raw notes to the console for demo visibility
        return data
    except FileNotFoundError:
        return "File not found. Please ensure the file exists."
    except Exception as e:
        # Best-effort tool: surface the error text to the agent instead of crashing.
        return str(e)
# NOTE(review): this assignment shadows the @function_tool of the same name
# defined above. It only works because the tools=[...] list on the right-hand
# side is evaluated before the name is rebound; after this statement,
# `read_company_data_from_txt` refers to the Agent, not the tool. Renaming the
# agent (e.g. to `summary_agent`) together with its use in main() would make
# this module much clearer.
read_company_data_from_txt = Agent(
    # Summarizer agent: reads the research notes via its tool, drafts a short
    # professional report, and revises it whenever feedback is appended.
    name="read_company_data_from_txt",
    instructions=(
        # Trailing spaces keep the implicitly-concatenated sentences separated
        # in the final prompt string.
        "Given a company name or ticker by the user, read the company data from the text file 3_research_notes.txt. "
        "Summarize them into 2-3 paragraphs and be informative so it reads like a professional report. "
        "If there is any feedback, incorporate them to improve the report. If the ticker is not found, say so."
    ),
    tools=[read_company_data_from_txt],  # still the tool here: RHS evaluates before rebinding
)
@dataclass
class EvaluationFeedback:
    """Structured verdict produced by the evaluator agent."""

    # Actionable critique for the summarizer to incorporate on the next round.
    feedback: str
    # Overall judgement; main() exits the revision loop only on "pass".
    score: Literal["pass", "expect_improvement", "fail"]
# Judge agent: emits a structured EvaluationFeedback (not free text) so the
# loop in main() can branch directly on `score`.
evaluator = Agent[None](
    name="evaluator",
    instructions=(
        # Trailing spaces keep the implicitly-concatenated sentences separated
        # in the final prompt string.
        "You evaluate a stock overview summary and decide if it's good enough. "
        "If it's not good enough, you provide feedback on what needs to be improved. "
        "Never give it a pass on the first try, but be increasingly generous so its chance of passing increases over time."
    ),
    output_type=EvaluationFeedback,
)
async def main() -> None:
    """
    Drive the generate -> evaluate ("LLM as a judge") loop.

    The summarizer agent drafts a stock overview from the research notes; the
    evaluator judges it. On anything but a "pass" the evaluator's feedback is
    appended to the conversation and the summarizer runs again. A generous
    round cap keeps a never-satisfied evaluator from looping forever.
    """
    msg = input("🤖: What company are you interested in? \n👧: ")
    input_items: list[TResponseInputItem] = [{"content": msg, "role": "user"}]
    summary: str | None = None

    # Safety cap: the evaluator is instructed never to pass on round 1, so an
    # unbounded `while True` could spin (and spend tokens) indefinitely.
    max_rounds = 10

    # We'll run the entire workflow in a single trace
    with trace("LLM as a judge"):
        for _round in range(max_rounds):
            summarized_results = await Runner.run(
                read_company_data_from_txt,
                input_items,
            )
            # Carry the whole conversation (including the new draft) forward
            # so both the evaluator and the next revision see full history.
            input_items = summarized_results.to_input_list()
            summary = ItemHelpers.text_message_outputs(summarized_results.new_items)
            print("Stock overview summary generated")

            evaluator_result = await Runner.run(evaluator, input_items)
            result: EvaluationFeedback = evaluator_result.final_output
            print(f"Evaluator score: {result.score}")

            if result.score == "pass":
                print("The stock summary is 💡 good enough, exiting.")
                break

            print("Re-running with feedback")
            input_items.append({"content": f"Feedback: {result.feedback}", "role": "user"})
        else:
            # for/else: only reached when the cap expired without a "pass".
            print(f"Evaluator never passed within {max_rounds} rounds; keeping the last draft.")

    print(f"Final Summary: {summary}")
    print("Input items:", input_items)
# Script entry point: run the async generate/evaluate workflow to completion.
if __name__ == "__main__":
    asyncio.run(main())