# Title: Enhanced YouTube Transcript Summarizer with Sentiment Analysis
# Name: Monpara Romil Kamleshbhai
# Enroll No.: 23002170210064
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from pytube import YouTube
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline
from textblob import TextBlob
from collections import Counter
import re
import nltk
import base64
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Dict, Union
# Constants
MAX_TEXT_LENGTH = 4000  # Character cap fed to the summarization model
MIN_TEXT_LENGTH = 50  # Minimum characters required for processing
DEFAULT_SUMMARY_LENGTH = 500
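
# Download the NLTK data the pipeline relies on (tokenizers, WordNet,
# stopwords, VADER lexicon); these calls are effectively no-ops once the
# data is already present locally.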
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')


@st.cache_resource
def load_models():
    """Load all ML models once and cache them."""
    return {
        'summarizer': pipeline("summarization", model="t5-small", framework="tf"),
        'sentiment_analyzer': SentimentIntensityAnalyzer()
    }
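
# A quick sketch of how the cached models are used (illustrative output):
#   models = load_models()
#   models['summarizer']("some long text ...")  # -> [{'summary_text': '...'}]
# st.cache_resource keeps both objects alive across Streamlit reruns, so the
# slow model download/load happens only once per process.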


def summarize_text(text: str, summarizer, max_length: int = DEFAULT_SUMMARY_LENGTH) -> str:
    """
    Summarize text using the T5 model.

    Args:
        text (str): Input text to summarize
        summarizer: Loaded summarization pipeline
        max_length (int): Maximum length of the summary

    Returns:
        str: Generated summary or an error message
    """
    if len(text) < MIN_TEXT_LENGTH:
        return "Text is too short to summarize."
    text = text[:MAX_TEXT_LENGTH]  # Truncate to fit the summarization model
    try:
        summary = summarizer(
            text,
            max_length=min(len(text) // 2, max_length),
            min_length=30,
            do_sample=False
        )
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error in summarization: {str(e)}"


def extract_keywords(text: str) -> List[str]:
    """
    Extract keywords using TF-IDF over unigrams and bigrams.

    Args:
        text (str): Input text

    Returns:
        List[str]: Top five keywords
    """
    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
        X = vectorizer.fit_transform([text])
        features = vectorizer.get_feature_names_out()
        scores = X.toarray()[0]
        return [features[i] for i in scores.argsort()[-5:][::-1]]
    except Exception:
        # Fall back to simple lemmatized word counts if TF-IDF fails
        lemmatizer = WordNetLemmatizer()
        words = re.findall(r'\b\w+\b', text.lower())
        words = [lemmatizer.lemmatize(word) for word in words
                 if word not in stopwords.words('english') and len(word) > 1]
        return [word for word, _ in Counter(words).most_common(5)]
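
# Illustrative call (actual output depends on TF-IDF scores):
#   extract_keywords("neural networks summarize transcripts; transcripts "
#                    "feed the neural networks")
#   -> up to five uni-/bi-grams, e.g. ['neural networks', 'transcripts', ...]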


def topic_modeling(text: str) -> List[List[str]]:
    """
    Perform topic modeling on text using LDA.

    Args:
        text (str): Input text

    Returns:
        List[List[str]]: Detected topics, each as its top words
    """
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        return [["Not enough text for topic modeling"]]
    try:
        vectorizer = CountVectorizer(max_df=0.9, min_df=1, stop_words='english')
        tf = vectorizer.fit_transform(sentences)
        if tf.shape[1] == 0:
            return [["Not enough unique words for topic modeling"]]
        lda_model = LatentDirichletAllocation(
            n_components=min(3, len(sentences)),
            max_iter=5,
            learning_method='online',
            random_state=42
        )
        lda_model.fit(tf)
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic in lda_model.components_:
            topics.append([feature_names[i] for i in topic.argsort()[:-6:-1]])
        return topics
    except Exception as e:
        return [[f"Topic modeling failed: {str(e)}"]]


def extract_video_id(url: str) -> Union[str, None]:
    """
    Extract the video ID from various YouTube URL formats.

    Args:
        url (str): YouTube URL

    Returns:
        str or None: Extracted video ID, or None if the URL is invalid
    """
    patterns = [
        r'v=([^&]+)',
        r'youtu\.be/([^?]+)',
        r'youtube\.com/embed/([^?]+)',
        r'youtube\.com/shorts/([^?]+)'
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
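
# URL formats covered above ("VIDEOID" is a placeholder):
#   extract_video_id("https://www.youtube.com/watch?v=VIDEOID")  -> "VIDEOID"
#   extract_video_id("https://youtu.be/VIDEOID")                 -> "VIDEOID"
#   extract_video_id("https://www.youtube.com/shorts/VIDEOID")   -> "VIDEOID"
#   extract_video_id("not a url")                                -> None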


def get_video_metadata(video_id: str) -> Dict[str, Union[str, int]]:
    """
    Get basic metadata about the YouTube video.

    Args:
        video_id (str): YouTube video ID

    Returns:
        Dict: Video metadata (title, author, etc.), or {} on failure
    """
    try:
        yt = YouTube(f"https://youtu.be/{video_id}")
        return {
            'title': yt.title,
            'author': yt.author,
            'length': f"{yt.length // 60}:{yt.length % 60:02d}",
            'views': f"{yt.views:,}"
        }
    except Exception:
        return {}
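
# pytube scrapes YouTube's web pages, so this lookup can break whenever the
# site changes; returning {} lets the app fall back to transcript-only
# analysis instead of failing outright.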


def create_download_link(text: str, filename: str, label: str) -> str:
    """
    Create a downloadable link for text content.

    Args:
        text (str): Content to download
        filename (str): Name for the downloaded file
        label (str): Display text for the link

    Returns:
        str: HTML download link
    """
    b64 = base64.b64encode(text.encode()).decode()
    return f'<a href="data:file/txt;base64,{b64}" download="{filename}">{label}</a>'
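
# The link embeds the file as a base64 data URI, so nothing is written
# server-side. A simpler alternative would be Streamlit's built-in widget:
#   st.download_button("Download Summary as TXT", text, file_name=filename)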


def sentiment_visualization(textblob_sent: Dict, vader_sent: Dict) -> None:
    """
    Create visualizations for sentiment analysis results.

    Args:
        textblob_sent (Dict): TextBlob sentiment results
        vader_sent (Dict): VADER sentiment results
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.bar(['Polarity', 'Subjectivity'],
            [textblob_sent['polarity'], textblob_sent['subjectivity']],
            color=['skyblue', 'salmon'])
    ax1.set_ylim(-1, 1)
    ax1.set_title('TextBlob Sentiment')
    vader_scores = {k: v for k, v in vader_sent.items() if k != 'compound'}
    ax2.bar(vader_scores.keys(), vader_scores.values())
    ax2.set_ylim(0, 1)
    ax2.set_title('VADER Sentiment')
    plt.tight_layout()
    st.pyplot(fig)
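
# Score ranges shown in the chart: TextBlob polarity lies in [-1, 1] and
# subjectivity in [0, 1]; VADER's neg/neu/pos scores sum to roughly 1.0
# (the compound score, in [-1, 1], is excluded from the bars).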


def main():
    """Main Streamlit application."""
    st.title("🎬 Enhanced YouTube Video Summarizer")
    st.markdown("""
This tool extracts and summarizes YouTube video transcripts, providing:
- 📝 Concise summary
- 🔑 Key keywords and topics
- 😊 Sentiment analysis
- 📥 Export options
""")

    # Load models once
    models = load_models()

    # Sidebar for additional options
    with st.sidebar:
        st.header("Settings")
        max_summary_length = st.slider(
            "Max Summary Length:",
            100, 2000, DEFAULT_SUMMARY_LENGTH
        )
        show_details = st.checkbox("Show detailed processing", False)

    video_url = st.text_input("Enter YouTube Video URL:", "")

    if st.button("Analyze Video"):
        if not video_url.strip():
            st.warning("Please enter a YouTube URL")
            return

        with st.spinner('Processing video...'):
            try:
                video_id = extract_video_id(video_url)
                if not video_id:
                    st.error("Invalid YouTube URL. Please enter a valid URL.")
                    return

                metadata = get_video_metadata(video_id)
                if metadata:
                    st.subheader(metadata['title'])
                    st.caption(f"👤 {metadata['author']} | ⏱️ {metadata['length']} | 👀 {metadata['views']} views")

                transcript = YouTubeTranscriptApi.get_transcript(video_id)
                if not transcript:
                    st.error("Transcript not available for this video.")
                    return

                video_text = ' '.join([line['text'] for line in transcript])
                if not video_text.strip():
                    st.error("Transcript appears to be empty.")
                    return

                if show_details:
                    with st.expander("Raw Transcript"):
                        st.text(video_text[:2000] + ("..." if len(video_text) > 2000 else ""))

                summary = summarize_text(video_text, models['summarizer'], max_summary_length)
                keywords = extract_keywords(video_text)
                topics = topic_modeling(video_text)
                blob_sentiment = TextBlob(video_text).sentiment
                vader_sentiment = models['sentiment_analyzer'].polarity_scores(video_text)

                st.subheader("📝 Summary")
                st.write(summary)

                st.subheader("🔑 Top Keywords")
                st.write(", ".join(keywords))

                st.subheader("🗂️ Detected Topics")
                for idx, topic in enumerate(topics):
                    st.write(f"Topic {idx + 1}: {', '.join(topic)}")

                st.subheader("😊 Sentiment Analysis")
                sentiment_visualization(
                    {'polarity': blob_sentiment.polarity,
                     'subjectivity': blob_sentiment.subjectivity},
                    vader_sentiment
                )

                st.subheader("📥 Export Options")
                # Text export
                st.markdown(create_download_link(
                    summary,
                    "summary.txt",
                    "Download Summary as TXT"
                ), unsafe_allow_html=True)

                # CSV export
                export_data = pd.DataFrame({
                    "Summary": [summary],
                    "Keywords": [', '.join(keywords)],
                    "Topics": [' | '.join([', '.join(t) for t in topics])],
                    "Polarity": [blob_sentiment.polarity],
                    "Subjectivity": [blob_sentiment.subjectivity],
                    "VADER_Positive": [vader_sentiment['pos']],
                    "VADER_Negative": [vader_sentiment['neg']],
                    "VADER_Neutral": [vader_sentiment['neu']],
                    "VADER_Compound": [vader_sentiment['compound']]
                })
                csv = export_data.to_csv(index=False)
                b64 = base64.b64encode(csv.encode()).decode()
                href = f'<a href="data:file/csv;base64,{b64}" download="analysis.csv">Download Full Analysis as CSV</a>'
                st.markdown(href, unsafe_allow_html=True)

            except TranscriptsDisabled:
                st.error("Transcripts are disabled for this video.")
            except NoTranscriptFound:
                st.error("No transcript found for this video.")
            except Exception as e:
                st.error(f"An error occurred: {str(e)}")
                if "CUDA out of memory" in str(e):
                    st.info("Try a shorter video or reduce the summary length.")


if __name__ == "__main__":
    main()