#!/usr/bin/env python3
"""Build per-slide markdown for a pitch deck analysis and publish it.

The module cleans free text down to plaintext, assembles one markdown
section per slide, derives an image-free plaintext version of a document
and uploads that version to two HTTP services to obtain shareable URLs.
"""
import json
import re

try:
    import requests
except ImportError:
    # Keep the pure text helpers importable without the HTTP client; the
    # upload function reports the missing package when it is called.
    requests = None


_REQUEST_TIMEOUT_SECONDS = 30

# Display math: $$ ... $$ (may span lines).
_DISPLAY_MATH_RE = re.compile(r'\$\$.*?\$\$', re.DOTALL)

# Inline math: $...$ on one line, where the opening "$" is followed by a
# non-space character, the closing "$" is preceded by a non-space character
# and is not followed by a digit.  These delimiter rules keep currency such
# as "$5M in 2020 and $10M in 2021" or "$5M-$10M" from being read as math.
_INLINE_MATH_RE = re.compile(r'\$(?=\S)[^$\n]+?(?<=\S)\$(?!\d)')

# ATX header marker: up to three leading blanks, 1-6 "#", then blanks or
# end of line.  Anchored to line start so "#1" or "C#" in prose survive.
_HEADER_MARKER_RE = re.compile(r'^[ \t]{0,3}#{1,6}(?:[ \t]+|$)', re.MULTILINE)

# Anything outside word characters, whitespace and basic punctuation.
_DISALLOWED_CHARS_RE = re.compile(
    r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]'
)


def clean_markdown_text(text):
    """Reduce markdown/LaTeX-flavoured text to plaintext.

    Removes LaTeX commands and math expressions, unwraps bold, italic and
    inline-code markers (keeping the wrapped text), drops header markers,
    replaces characters outside a basic punctuation whitelist with spaces,
    and normalizes whitespace while preserving paragraph breaks.

    Args:
        text: The text to clean. Any falsy value (``None``, ``""``) is
            treated as empty.

    Returns:
        The cleaned text with leading and trailing whitespace removed, or
        ``""`` for falsy input.
    """
    if not text:
        return ""

    # LaTeX: \command{content} first, then math, then bare \commands.
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)
    text = _DISPLAY_MATH_RE.sub('', text)
    text = _INLINE_MATH_RE.sub('', text)
    text = re.sub(r'\\[a-zA-Z]+', '', text)

    # Markdown emphasis and code: drop the markers, keep the text.
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    text = _HEADER_MARKER_RE.sub('', text)

    # Everything outside the whitelist becomes a space so words stay apart.
    text = _DISALLOWED_CHARS_RE.sub(' ', text)

    # Collapse horizontal whitespace only; newlines are handled separately
    # so paragraph breaks are not flattened into a single line.
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r' *\n *', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Create the markdown section for a single slide.

    The section contains a header, an image reference, an optional cleaned
    "Text Content" part, one cleaned sub-section per analysis result and a
    trailing horizontal rule.

    Args:
        slide_data: Mapping that provides the image file name under the
            ``'filename'`` key.
        analysis_results: Mapping whose values are mappings with the keys
            ``'agent'`` (sub-section title) and ``'analysis'`` (body text).
            The mapping's own keys are not used.
        slide_num: Slide number shown in the header and image alt text.
        slide_text: Optional text of the slide; skipped when empty or
            whitespace-only.

    Returns:
        The markdown for the slide, ending with ``"---\\n\\n"``.

    Raises:
        KeyError: If ``'filename'``, ``'agent'`` or ``'analysis'`` is missing.
    """
    parts = [
        f"# Slide {slide_num}\n\n",
        f"![Slide {slide_num}](slides/{slide_data['filename']})\n\n",
    ]

    if slide_text and slide_text.strip():
        clean_slide_text = clean_markdown_text(slide_text)
        parts.append(f"## Text Content\n\n{clean_slide_text}\n\n")

    parts.append("## Agentic Analysis\n\n")

    for result in analysis_results.values():
        clean_analysis = clean_markdown_text(result['analysis'])
        parts.append(f"### {result['agent']}\n\n{clean_analysis}\n\n")

    parts.append("---\n\n")
    return "".join(parts)


def create_text_only_markdown(markdown_content):
    """Create a plaintext version of a markdown document for API submission.

    Image embeds and "View full size" links that point into ``slides/`` are
    removed together with the slide-separating horizontal rules; the rest is
    passed through ``clean_markdown_text``.

    Args:
        markdown_content: The full markdown document.

    Returns:
        The cleaned text without surrounding whitespace.
    """
    text_only = markdown_content

    # Image embedding lines: ![alt](slides/...)
    text_only = re.sub(r'!\[.*?\]\(slides/.*?\)\n', '', text_only)

    # Image link lines: *[View full size: slides/...](slides/...)*
    text_only = re.sub(
        r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', '', text_only
    )

    # Horizontal rules that were added for slide separation.
    text_only = re.sub(r'^---\n', '', text_only, flags=re.MULTILINE)

    # Removing whole lines can leave runs of blank lines behind.
    text_only = re.sub(r'\n{3,}', '\n\n', text_only)

    text_only = clean_markdown_text(text_only)
    return text_only.strip()


def _extract_haste_key(response_text):
    """Return the document key from a haste upload response body.

    Accepts a JSON object with a ``"key"`` member (with or without
    whitespace between tokens) as well as a bare or double-quoted token.
    """
    stripped = response_text.strip()
    try:
        payload = json.loads(stripped)
    except ValueError:
        payload = None
    if isinstance(payload, dict) and payload.get('key'):
        return str(payload['key'])
    return stripped.strip('"')


def _upload_raw_markdown(text_only_markdown):
    """Upload the plaintext to haste.nixc.us; return its URL or None."""
    try:
        print(" 📝 Creating raw markdown URL...")
        raw_response = requests.post(
            "https://haste.nixc.us/documents",
            data=text_only_markdown.encode('utf-8'),
            headers={"Content-Type": "text/plain"},
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        if raw_response.status_code != 200:
            print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
            return None

        raw_token = _extract_haste_key(raw_response.text)
        print(" ✅ Raw markdown URL created")
        return f"https://haste.nixc.us/{raw_token}"
    except Exception as e:
        # Best effort: a failed raw upload must not block the HTML upload.
        print(f" ⚠️ Failed to create raw markdown URL: {e}")
        return None


def _create_html_version(text_only_markdown, document_title):
    """Request an HTML rendering from md.colinknapp.com; return URL or None."""
    try:
        print(" 🎨 Creating HTML version URL...")
        api_data = {
            "markdown": text_only_markdown,
            "format": "html",
            "template": "playful",
            "title": f"Pitch Deck Analysis: {document_title}",
            "subtitle": "AI-Generated Analysis with Agentic Insights",
            "contact": "Generated by Pitch Deck Parser",
            "send_to_haste": True,
        }
        response = requests.post(
            "https://md.colinknapp.com/api/convert",
            headers={"Content-Type": "application/json"},
            data=json.dumps(api_data),
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            print(f" ⚠️ HTML API request failed with status {response.status_code}")
            return None

        result = response.json()
        if 'haste_url' not in result:
            print(" ⚠️ API response missing haste_url")
            return None

        # Re-point a haste.nixc.us link at the md.colinknapp.com viewer;
        # any other URL is passed through unchanged.
        haste_url = result['haste_url']
        if 'haste.nixc.us/' in haste_url:
            token = haste_url.split('haste.nixc.us/')[-1]
            html_url = f"https://md.colinknapp.com/haste/{token}"
        else:
            html_url = haste_url
        print(" ✅ HTML version URL created")
        return html_url
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f" ⚠️ Failed to create HTML URL: {e}")
        return None


def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Publish a document and return its raw-markdown and HTML URLs.

    The document is converted with ``create_text_only_markdown`` and then
    uploaded twice: as plain text to haste.nixc.us and as a conversion
    request to md.colinknapp.com. The two uploads are independent, so one
    may fail while the other succeeds. Progress and failures are printed;
    no exception is raised to the caller.

    Args:
        markdown_content: The full markdown document.
        document_title: Title appended to "Pitch Deck Analysis: " in the
            HTML conversion request.

    Returns:
        A ``(raw_haste_url, html_url)`` tuple. Each element is ``None`` when
        the corresponding upload failed.
    """
    try:
        print("Sending to API for URLs...")

        if requests is None:
            raise RuntimeError("the 'requests' package is not installed")

        text_only_markdown = create_text_only_markdown(markdown_content)
        raw_haste_url = _upload_raw_markdown(text_only_markdown)
        html_url = _create_html_version(text_only_markdown, document_title)
        return raw_haste_url, html_url
    except Exception as e:
        # Top-level boundary: publishing is optional, so never propagate.
        print(f"⚠️ Failed to send to API: {e}")
        return None, None