Skip to content

YouTube to Blog Converter

Difficulty: Beginner-Intermediate
Time: 30-45 minutes
Learning Focus: API integration, text processing, content repurposing
Module: chat

Overview

Create a tool that converts YouTube video transcripts into well-formatted blog posts using AI. This project helps students understand how to extract content in one format (a video transcript) and transform it into another (a blog post).

Instructions

import re
import argparse
from youtube_transcript_api import YouTubeTranscriptApi
from hands_on_ai.chat import get_response

def youtube_to_blog():
    """Run the interactive YouTube-to-blog workflow on the console.

    Asks for a video URL or ID, fetches the transcript, collects optional
    metadata and style preferences, then hands everything to
    ``generate_blog_post``. Returns ``None``; the function stops early
    (after printing a message where applicable) when no video ID can be
    extracted or no transcript is available.
    """
    for banner_line in (
        "=== YouTube to Blog Converter ===",
        "This tool extracts a transcript from a YouTube video and converts it to a blog post.",
    ):
        print(banner_line)

    # Accept either a full URL or a bare ID and normalise it to an ID.
    raw_video = input("Enter YouTube video URL or video ID: ")
    video_id = extract_video_id(raw_video)
    if not video_id:
        print("Error: Could not extract a valid YouTube video ID.")
        return

    print(f"Processing video ID: {video_id}")

    # Fetch the transcript; a falsy result means there is nothing to convert.
    try:
        transcript = get_transcript(video_id)
    except Exception as err:
        print(f"Error getting transcript: {err}")
        return
    if not transcript:
        return

    # Optional metadata: an empty answer means "skip".
    video_title = input("Enter video title (or press Enter to skip): ")
    video_author = input("Enter video creator/channel name (or press Enter to skip): ")

    # Present the style menu.
    print("\nBlog Style Options:")
    for menu_entry in (
        "1. Informational/Educational",
        "2. Conversational/Casual",
        "3. Professional/Formal",
        "4. Tutorial/How-To",
    ):
        print(menu_entry)

    # Map the menu number to the style name used in the prompt.
    styles_by_choice = {
        "1": "informational",
        "2": "conversational",
        "3": "professional",
        "4": "tutorial",
    }
    style_choice = input("Select a style (1-4): ")
    blog_style = styles_by_choice.get(style_choice)
    if blog_style is None:
        print("Invalid choice. Using informational style.")
        blog_style = "informational"

    # Extra context that is passed through to the prompt verbatim.
    topic_keywords = input("Enter 3-5 keywords related to the video (comma separated): ")
    audience = input("Who is the target audience for this blog post? ")

    generate_blog_post(transcript, video_title, video_author, blog_style, topic_keywords, audience)

def extract_video_id(video_input):
    """Return the YouTube video ID contained in *video_input*.

    Args:
        video_input: Either a bare 11-character video ID or a YouTube URL
            (``youtube.com/watch?v=...``, ``youtu.be/...``,
            ``youtube.com/embed/...``, ``youtube.com/v/...``). Surrounding
            whitespace is ignored.

    Returns:
        The 11-character video ID, or ``None`` when the input is empty or no
        ID can be found.
    """
    if not video_input:
        return None

    # Pasted input often carries stray spaces or a trailing newline.
    candidate = video_input.strip()

    # Already a bare ID: exactly 11 characters from the ID alphabet.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
        return candidate

    # Otherwise look for the ID after one of the known URL prefixes.
    youtube_regex = r'(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})'
    match = re.search(youtube_regex, candidate)
    return match.group(1) if match else None

def get_transcript(video_id):
    """Fetch a video's transcript and return it as one string.

    The segments returned by ``YouTubeTranscriptApi.get_transcript`` are
    joined with single spaces, and a preview of up to 150 characters is
    printed. Any exception raised along the way is reported on the console
    together with a list of likely causes.

    Returns:
        The joined transcript text, or ``None`` if retrieval failed.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)

        # Each segment is indexed by 'text'; stitch them into one string.
        full_transcript = " ".join(segment['text'] for segment in segments)

        # Slicing already caps the preview at the transcript's own length.
        preview = full_transcript[:150]
        print(f"\nTranscript preview ({len(full_transcript)} characters):")
        print(f"{preview}...")

        return full_transcript

    except Exception as e:
        print(f"Error: Could not retrieve transcript. {str(e)}")
        for hint in (
            "Possible reasons:",
            "- The video might not have closed captions/subtitles",
            "- The video ID might be incorrect",
            "- The video owner may have disabled transcript access",
        ):
            print(hint)
        return None

def generate_blog_post(transcript, title, author, style, keywords, audience,
                       max_transcript_chars=4000):
    """Generate a blog post from the transcript using AI and save it to disk.

    Builds a prompt from the transcript and the supplied metadata, sends it
    to ``get_response``, writes the reply to a Markdown file in the current
    directory and prints a 10-line preview. Errors from the AI call or the
    file write are reported on the console rather than raised.

    Args:
        transcript: Full transcript text of the video.
        title: Video title; falsy means "Not provided" and selects the
            default output filename.
        author: Creator/channel name; falsy means "Not provided".
        style: Tone of the post (e.g. "informational").
        keywords: Free-form keyword string passed through to the prompt.
        audience: Free-form description of the target audience.
        max_transcript_chars: Maximum number of transcript characters put
            into the prompt. ``None`` disables the limit. Defaults to 4000.

    Returns:
        None.
    """
    print("\nGenerating blog post...")

    # Tell the user when only part of the transcript reaches the model,
    # instead of dropping the remainder silently.
    if max_transcript_chars is not None and len(transcript) > max_transcript_chars:
        print(f"Note: transcript truncated to the first {max_transcript_chars} "
              f"of {len(transcript)} characters.")
    transcript_excerpt = transcript[:max_transcript_chars]

    # Create a prompt for the AI. Everything inside the f-string is sent to
    # the model verbatim, so no code comments may appear in it.
    prompt = f"""
    Convert this YouTube video transcript into a well-structured blog post:

    VIDEO INFORMATION:
    Title: {title if title else "Not provided"}
    Creator: {author if author else "Not provided"}
    Style: {style}
    Keywords: {keywords}
    Target Audience: {audience}

    TRANSCRIPT:
    {transcript_excerpt}

    Please create a complete blog post with:
    1. An engaging headline/title
    2. Introduction that hooks the reader
    3. Well-structured sections with subheadings
    4. Conclusion or call-to-action
    5. Add relevant statistics or examples where appropriate

    Format the post using markdown syntax for headings, lists, etc.
    The tone should be {style} and appropriate for the specified audience.
    Expand on any concepts from the video that need more explanation.
    Add 3-5 relevant tags at the end of the post.

    Length: Aim for ~1000-1500 words.
    """

    # Build a filesystem-safe filename: every character that is not a word
    # character or hyphen (spaces, slashes, colons, ...) becomes "_", so a
    # title can neither break open() nor point outside the current directory.
    if title:
        safe_title = re.sub(r'[^\w-]', '_', title)[:30]
    else:
        safe_title = 'from_youtube'
    filename = f"blog_post_{safe_title}.md"

    try:
        # Generate blog post using the prompt
        blog_post = get_response(prompt)

        # Write as UTF-8 explicitly: the generated text may contain
        # characters the platform's default encoding cannot represent.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(blog_post)

        print(f"\nBlog post successfully generated and saved to: {filename}")

        # Print a preview of the blog post
        preview_lines = blog_post.split('\n')[:10]
        print("\nBlog Post Preview:")
        print("\n".join(preview_lines) + "\n...")

    except Exception as e:
        print(f"Error generating blog post: {e}")

# Start the interactive converter only when this file is executed as a
# script; importing it as a module does not run anything.
if __name__ == "__main__":
    youtube_to_blog()

Extension Ideas

  • Add support for multiple languages and translation
  • Implement a social media post generator from the same content
  • Create a scheduler to process videos in batch
  • Add image extraction from video thumbnails or frames
  • Build a web interface with Flask or Streamlit
  • Implement SEO optimisation suggestions for the generated content