# Source: llm-python/10_journal.py (beoncloud8/llm-python on GitHub)

from dotenv import load_dotenv
load_dotenv()

import argparse
import logging
import sys
from pathlib import Path
import os
from llama_index.core import (
    Document,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    SimpleDirectoryReader
)
from llama_index.core.node_parser import SimpleNodeParser

# Log at INFO level to stdout to see the token counter and token usage for the
# LLM and Embedding. basicConfig() already attaches a StreamHandler for the
# given stream to the root logger, so no second handler is added: a second one
# would emit every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


# Journal folder to index; the OBSIDIAN_JOURNAL_DIR environment variable
# overrides the hard-coded default path.
OBSIDIAN_DIR = os.getenv("OBSIDIAN_JOURNAL_DIR", "/Users/beoncloud/Documents/bcvault/Journal")

# Fail fast when the path is missing or is not a directory (a plain file at
# this path would pass an existence-only check).
if not os.path.isdir(OBSIDIAN_DIR):
    raise FileNotFoundError(f"Obsidian directory not found: {OBSIDIAN_DIR}")

# NOTE(review): this eagerly loads the directory at import time, but no code
# visible in this file reads the result (the index is built from
# create_journal_nodes, which rebinds `docs`) — confirm whether it can be removed.
docs = SimpleDirectoryReader(OBSIDIAN_DIR).load_data()


def read_journal_md(file_path):
    """Return the paragraph text of one Markdown journal file as a single string.

    The file is read as UTF-8, rendered to HTML with ``markdown`` and parsed
    with BeautifulSoup; the text of every ``<p>`` element is then joined with
    single spaces.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        The joined paragraph text, or ``""`` when the file is empty or
        whitespace-only, contains no ``<p>`` elements, or cannot be read or
        processed. A message is printed in each of those cases.
    """
    from bs4 import BeautifulSoup
    import markdown

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        if not text.strip():
            print(f"Warning: Empty file {file_path}")
            return ""

        html = markdown.markdown(text)
        soup = BeautifulSoup(html, "html.parser")

        # NOTE(review): only <p> elements are kept, so text that is not wrapped
        # in a <p> (e.g. headings) is dropped — confirm this is intended.
        ps = soup.find_all("p")

        if not ps:
            print(f"Warning: No paragraphs found in {file_path}")
            return ""

        # Combine all paragraph text
        result = " ".join(p.text for p in ps)

        print(f"Finished processing {file_path}")
        return result

    except FileNotFoundError:
        print(f"Error: File not found {file_path}")
        return ""
    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the
        # caller's loop over the whole directory.
        print(f"Error processing {file_path}: {str(e)}")
        return ""


def create_journal_nodes(dir_path):
    """Build Documents and parsed nodes from the ``*.md`` files in ``dir_path``.

    Each Markdown file directly inside ``dir_path`` is read with
    ``read_journal_md``; files whose text is empty or whitespace-only are
    skipped. The remaining texts are wrapped in ``Document`` objects and split
    into nodes with ``SimpleNodeParser``.

    Returns:
        A ``(nodes, docs)`` tuple, or ``([], [])`` when no usable document was
        found or an error occurred (a message is printed in both cases).

    Reference: https://gpt-index.readthedocs.io/en/stable/guides/primer/usage_pattern.html
    """
    node_parser = SimpleNodeParser()

    try:
        # map() is lazy, so every file is read and wrapped one at a time
        journal_docs = [
            Document(text=body)
            for body in map(read_journal_md, Path(dir_path).glob("*.md"))
            if body.strip()
        ]

        if journal_docs:
            journal_nodes = node_parser.get_nodes_from_documents(journal_docs)
            return journal_nodes, journal_docs

        print("Warning: No valid documents found")
    except Exception as e:
        print(f"Error processing directory {dir_path}: {str(e)}")

    # Reached when nothing usable was found or when processing failed
    return [], []

# Directory where the vector index is persisted between runs
PERSIST_DIR = "./storage"


def _build_and_persist_index():
    """Build a VectorStoreIndex from the journal directory and persist it.

    Returns:
        The newly created index.

    Raises:
        SystemExit: With status 1 when the journal directory yields no nodes.
    """
    nodes, _ = create_journal_nodes(OBSIDIAN_DIR)
    if not nodes:
        print("No nodes to create index")
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is absent when Python runs with -S
        sys.exit(1)

    new_index = VectorStoreIndex(nodes)
    new_index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Created and persisted new index")
    return new_index


# Reuse a previously persisted index when one exists; rebuild it from the
# journal files when it is missing or cannot be loaded.
if Path(PERSIST_DIR).exists():
    try:
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context)
        print("Loaded existing index from storage")
    except Exception as e:
        print(f"Error loading index from storage: {str(e)}")
        print("Creating new index...")
        index = _build_and_persist_index()
else:
    index = _build_and_persist_index()

def main():
    """Answer one question about the journals from the command line.

    Usage: python 10_journal.py -q "what are places I ate at in March and April?"

    Parses the required ``-q/--query`` argument, runs it through the index's
    query engine and prints the query followed by the result. Exits with
    status 0 without querying when the supplied query is an empty string.
    """
    # cli argument parser; parsed first so that --help or a usage error exits
    # before the query engine is created
    parser = argparse.ArgumentParser(
        prog="QueryJournal",
        description="Query my bullet journals in Obsidian using Llama Index."
    )
    parser.add_argument(
        "-q",
        "--query",
        type=str,
        help="Ask a question answerable in my journals",
        required=True
    )
    args = parser.parse_args()
    query = args.query

    # required=True guarantees the flag is present, but `-q ""` still yields
    # an empty string
    if not query:
        print("No query provided. Exiting...")
        sys.exit(0)

    query_engine = index.as_query_engine()
    res = query_engine.query(query)
    print(f"Query: {query}")
    print(f"Results: \n {res}")


if __name__ == "__main__":
    main()