technical-screen-2025-10-22/modules/markdown_utils.py

#!/usr/bin/env python3

import re
import requests
import json


def clean_markdown_text(text):
    """Reduce markdown/LaTeX-flavoured text to plain text.

    Processing order:

    1. Drop LaTeX: ``\\command{...}`` (including its argument), inline
       ``$math$`` spans, and any remaining bare ``\\command``.
    2. Unwrap markdown emphasis/code markers (``**bold**``, ``*italic*``,
       `` `code` ``) keeping the inner text, and strip ATX header markers
       (``#`` .. ``######``) at the start of a line.
    3. Replace every character that is not a word character, whitespace or
       basic punctuation with a space.
    4. Collapse runs of spaces/tabs to one space, trim spaces around line
       breaks, and collapse runs of blank lines to a single blank line.

    Args:
        text: The string to clean. Any falsy value (``None``, ``""``) is
            accepted.

    Returns:
        The cleaned string with leading/trailing whitespace removed, or
        ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    # Only treat $...$ as math when the opening "$" is followed by a
    # non-space, the closing "$" is preceded by a non-space and is not
    # followed by a digit. Without these guards two currency amounts such as
    # "$5M ... $10M" would be read as one math span and the text between
    # them deleted.
    text = re.sub(r'\$(?=\S)[^$]*?(?<=\S)\$(?!\d)', '', text)
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove remaining \commands

    # Remove markdown formatting but keep the text
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # Remove bold **text**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)  # Remove italic *text*
    text = re.sub(r'`([^`]+)`', r'\1', text)  # Remove code `text`
    # Header markers only count at the start of a line and must be followed
    # by whitespace (or end of line); a "#" inside a sentence ("#1 in sales")
    # is ordinary text and is kept.
    text = re.sub(r'^[ \t]*#{1,6}(?:[ \t]+|$)', '', text, flags=re.MULTILINE)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Clean up whitespace. Newlines are excluded from the first pass so that
    # the blank-line normalisation below still has line breaks to work with.
    text = re.sub(r'[^\S\n]+', ' ', text)  # runs of non-newline whitespace -> one space
    text = re.sub(r' ?\n ?', '\n', text)  # no stray spaces around line breaks
    text = re.sub(r'\n{3,}', '\n\n', text)  # at most one blank line in a row

    return text.strip()


def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Render one slide as a markdown section.

    The section consists of, in order: a ``# Slide N`` heading, an image
    reference to ``slides/<filename>``, an optional ``## Text Content`` block
    (only when ``slide_text`` contains non-whitespace), an
    ``## Agentic Analysis`` block with one ``###`` sub-heading per analysis
    result, and a closing ``---`` rule.

    Args:
        slide_data: Mapping that must provide a ``'filename'`` entry.
        analysis_results: Mapping whose values each provide ``'agent'`` and
            ``'analysis'`` entries; the keys are not used.
        slide_num: Slide number shown in the heading and image alt text.
        slide_text: Optional text for the slide; it is passed through
            ``clean_markdown_text`` before being embedded.

    Returns:
        The markdown for this slide as a single string.
    """
    parts = [
        f"# Slide {slide_num}\n\n![Slide {slide_num}](slides/{slide_data['filename']})\n\n"
    ]

    # Only emit the text section when there is something other than whitespace.
    if slide_text and slide_text.strip():
        plain_slide_text = clean_markdown_text(slide_text)
        parts.append(f"## Text Content\n\n{plain_slide_text}\n\n")

    parts.append("## Agentic Analysis\n\n")

    for entry in analysis_results.values():
        # Analyses are reduced to plaintext before being embedded.
        plain_analysis = clean_markdown_text(entry['analysis'])
        parts.append(f"### {entry['agent']}\n\n{plain_analysis}\n\n")

    # Horizontal rule separating this slide from the next one.
    parts.append("---\n\n")
    return "".join(parts)


def create_text_only_markdown(markdown_content):
    """Return a plaintext rendition of ``markdown_content`` without images.

    Image embeds and "View full size" links that point into ``slides/`` are
    removed, the ``---`` separator lines are dropped, runs of three or more
    newlines are reduced to two, and the result is passed through
    ``clean_markdown_text``.

    Args:
        markdown_content: The markdown document as a string.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # (pattern, replacement, flags) applied strictly in this order.
    substitutions = (
        # Image embedding lines
        (r'!\[.*?\]\(slides/.*?\)\n', '', 0),
        # Image link lines
        (r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', '', 0),
        # Horizontal rules used as slide separators
        (r'^---\n', '', re.MULTILINE),
        # Leftover runs of blank lines
        (r'\n{3,}', '\n\n', 0),
    )

    stripped_down = markdown_content
    for pattern, replacement, flags in substitutions:
        stripped_down = re.sub(pattern, replacement, stripped_down, flags=flags)

    # Final pass to guarantee plaintext output.
    return clean_markdown_text(stripped_down).strip()


def _extract_haste_key(body):
    """Return the document key contained in a haste upload response body.

    Handles a JSON object carrying a ``"key"`` entry (with any spacing), a
    JSON-quoted string, and a bare token. Returns ``""`` when a JSON object
    has no usable ``"key"``.
    """
    body = body.strip()
    try:
        parsed = json.loads(body)
    except ValueError:
        # Not JSON: treat the body as the token itself, minus stray quotes.
        return body.strip('"')
    if isinstance(parsed, dict):
        return str(parsed.get('key') or '')
    if isinstance(parsed, str):
        return parsed.strip()
    # Any other JSON value (e.g. a number): fall back to the literal text.
    return body.strip('"')


def _upload_raw_markdown(text_only_markdown, timeout):
    """Upload plaintext to haste.nixc.us; return the document URL or None."""
    try:
        print("  📝 Creating raw markdown URL...")
        raw_response = requests.post(
            "https://haste.nixc.us/documents",
            data=text_only_markdown.encode('utf-8'),
            headers={"Content-Type": "text/plain"},
            timeout=timeout
        )

        if raw_response.status_code != 200:
            print(f"  ⚠️  Raw markdown upload failed with status {raw_response.status_code}")
            return None

        raw_token = _extract_haste_key(raw_response.text)
        if not raw_token:
            # Without a key the URL would just point at the site root.
            print("  ⚠️  Raw markdown upload returned no document key")
            return None

        print("  ✅ Raw markdown URL created")
        return f"https://haste.nixc.us/{raw_token}"
    except Exception as e:
        # Best effort: a failed raw upload must not block the HTML conversion.
        print(f"  ⚠️  Failed to create raw markdown URL: {e}")
        return None


def _request_html_url(text_only_markdown, document_title, timeout):
    """Ask md.colinknapp.com for an HTML rendition; return its URL or None."""
    try:
        print("  🎨 Creating HTML version URL...")
        api_data = {
            "markdown": text_only_markdown,
            "format": "html",
            "template": "playful",
            "title": f"Pitch Deck Analysis: {document_title}",
            "subtitle": "AI-Generated Analysis with Agentic Insights",
            "contact": "Generated by Pitch Deck Parser",
            "send_to_haste": True
        }

        response = requests.post(
            "https://md.colinknapp.com/api/convert",
            headers={"Content-Type": "application/json"},
            data=json.dumps(api_data),
            timeout=timeout
        )

        if response.status_code != 200:
            print(f"  ⚠️  HTML API request failed with status {response.status_code}")
            return None

        result = response.json()
        if 'haste_url' not in result:
            print("  ⚠️  API response missing haste_url")
            return None

        # Re-point haste.nixc.us links at the md.colinknapp.com viewer;
        # any other URL is passed through unchanged.
        haste_url = result['haste_url']
        if 'haste.nixc.us/' in haste_url:
            token = haste_url.split('haste.nixc.us/')[-1]
            html_url = f"https://md.colinknapp.com/haste/{token}"
        else:
            html_url = haste_url
        print("  ✅ HTML version URL created")
        return html_url
    except Exception as e:
        # Best effort: report and carry on so the raw URL is still returned.
        print(f"  ⚠️  Failed to create HTML URL: {e}")
        return None


def send_to_api_and_get_haste_link(markdown_content, document_title, timeout=30):
    """Publish a markdown document and return links to it.

    The document is first converted with ``create_text_only_markdown``. The
    resulting text is then (1) posted to ``https://haste.nixc.us/documents``
    to obtain a raw-text URL and (2) posted to
    ``https://md.colinknapp.com/api/convert`` to obtain an HTML URL. The two
    requests are independent: a failure of one is printed as a warning and
    does not prevent the other.

    Args:
        markdown_content: The full markdown document.
        document_title: Title inserted into the HTML conversion request.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        A ``(raw_haste_url, html_url)`` tuple. Either element is ``None``
        when the corresponding step failed. The function does not raise for
        ordinary errors; they are printed instead.
    """
    try:
        print("Sending to API for URLs...")

        # Create text-only version for API
        text_only_markdown = create_text_only_markdown(markdown_content)

        raw_haste_url = _upload_raw_markdown(text_only_markdown, timeout)
        html_url = _request_html_url(text_only_markdown, document_title, timeout)

        return raw_haste_url, html_url

    except Exception as e:
        print(f"⚠️  Failed to send to API: {e}")
        return None, None