technical-screen-2025-10-22/modules/markdown_utils.py

174 lines
6.2 KiB
Python

#!/usr/bin/env python3
import re
import requests
import json
def clean_markdown_text(text):
    """Flatten markdown/LaTeX-flavoured text into one line of plaintext.

    Steps, in order:
      1. LaTeX commands (together with one braced argument) and ``$...$``
         inline math are removed entirely.
      2. Bold, italic and inline-code markers are dropped; the wrapped text
         is kept.
      3. ``#`` header markers at the start of a line are removed.
      4. Any character that is not a word character, whitespace, or one of
         the allow-listed punctuation marks is replaced with a space.
      5. Every run of whitespace (newlines included) collapses to a single
         space.

    Args:
        text: The string to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned, stripped, single-line string; ``""`` for falsy input.
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions.
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \command{content}
    # Inline math: the opening "$" must be followed by a non-space, the
    # closing "$" must be preceded by a non-space and must not be followed
    # by a digit. Without these guards, two currency amounts in one text
    # ("$5M ... $10M") are treated as delimiters and the text between them
    # is deleted.
    text = re.sub(r'\$(?=\S)[^$]*?(?<=\S)\$(?!\d)', '', text)
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # remaining bare \commands

    # Remove markdown formatting but keep the text.
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **bold**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)  # *italic*
    text = re.sub(r'`([^`]+)`', r'\1', text)  # `code`
    # Only strip hashes that act as a header marker (start of line, followed
    # by whitespace or end of line) so inline uses such as "C#" or "#1"
    # survive; "#" is on the punctuation allow-list below.
    text = re.sub(r'^[ \t]*#{1,6}(?:[ \t]+|$)', '', text, flags=re.MULTILINE)

    # Replace special characters with a space but keep basic punctuation.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Collapse all whitespace, newlines included, to single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Render one slide as a markdown fragment.

    The fragment contains, in order: a slide heading with its image
    reference, an optional "Text Content" section (only when ``slide_text``
    has non-whitespace content), an "Agentic Analysis" section with one
    sub-heading per entry of ``analysis_results``, and a trailing ``---``
    separator.

    Args:
        slide_data: Mapping that provides the image file name under
            ``'filename'``.
        analysis_results: Mapping whose values each provide ``'analysis'``
            and ``'agent'`` entries; the keys are not used.
        slide_num: Slide number shown in the heading and image alt text.
        slide_text: Optional extracted slide text.

    Returns:
        The markdown fragment as a single string.
    """
    sections = [
        f"# Slide {slide_num}\n"
        f"![Slide {slide_num}](slides/{slide_data['filename']})\n"
    ]

    # Slide text is passed through clean_markdown_text before embedding.
    has_text = bool(slide_text) and bool(slide_text.strip())
    if has_text:
        plain_slide_text = clean_markdown_text(slide_text)
        sections.append(f"## Text Content\n{plain_slide_text}\n")

    sections.append("## Agentic Analysis\n")

    # One sub-section per agent; analysis text is cleaned the same way.
    for entry in analysis_results.values():
        plain_analysis = clean_markdown_text(entry['analysis'])
        sections.append(f"### {entry['agent']}\n{plain_analysis}\n")

    sections.append("---\n\n")
    return "".join(sections)
def create_text_only_markdown(markdown_content):
    """Produce an image-free plaintext version of the deck markdown.

    Image embeds, "View full size" image links and ``---`` separator lines
    are removed, runs of three or more newlines are squeezed to two, and the
    remainder is passed through ``clean_markdown_text``.

    Args:
        markdown_content: The full markdown document as a string.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    line_removals = (
        # Embedded slide images: ![alt](slides/...)
        re.compile(r'!\[.*?\]\(slides/.*?\)\n'),
        # Italic "view full size" links pointing at slide images.
        re.compile(r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n'),
        # Horizontal rules used as slide separators.
        re.compile(r'^---\n', re.MULTILINE),
    )

    stripped = markdown_content
    for pattern in line_removals:
        stripped = pattern.sub('', stripped)

    # Squeeze the gaps left behind by the removed lines.
    stripped = re.sub(r'\n{3,}', '\n\n', stripped)

    # Final plaintext pass.
    return clean_markdown_text(stripped).strip()
def _extract_haste_key(body):
    """Return the document key from a haste upload response body.

    Accepts either a JSON object carrying a ``key`` field or a bare
    (optionally double-quoted) token.
    """
    stripped = body.strip()
    try:
        payload = json.loads(stripped)
    except ValueError:
        # Not JSON at all: treat the body as the token itself.
        payload = None
    if isinstance(payload, dict) and 'key' in payload:
        return payload['key']
    return stripped.strip('"')


def _upload_raw_markdown(text_only_markdown):
    """POST the plaintext to haste.nixc.us; return its URL, or None on failure."""
    try:
        print(" 📝 Creating raw markdown URL...")
        raw_response = requests.post(
            "https://haste.nixc.us/documents",
            data=text_only_markdown.encode('utf-8'),
            headers={"Content-Type": "text/plain"},
            timeout=30
        )
        if raw_response.status_code != 200:
            print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
            return None
        raw_token = _extract_haste_key(raw_response.text)
        raw_haste_url = f"https://haste.nixc.us/{raw_token}"
        print(" ✅ Raw markdown URL created")
        return raw_haste_url
    except Exception as e:
        # Best-effort: a failed raw upload must not block the HTML upload.
        print(f" ⚠️ Failed to create raw markdown URL: {e}")
        return None


def _create_html_url(text_only_markdown, document_title):
    """Ask md.colinknapp.com for an HTML rendering; return its URL, or None on failure."""
    try:
        print(" 🎨 Creating HTML version URL...")
        api_data = {
            "markdown": text_only_markdown,
            "format": "html",
            "template": "playful",
            "title": f"Pitch Deck Analysis: {document_title}",
            "subtitle": "AI-Generated Analysis with Agentic Insights",
            "contact": "Generated by Pitch Deck Parser",
            "send_to_haste": True
        }
        response = requests.post(
            "https://md.colinknapp.com/api/convert",
            headers={"Content-Type": "application/json"},
            data=json.dumps(api_data),
            timeout=30
        )
        if response.status_code != 200:
            print(f" ⚠️ HTML API request failed with status {response.status_code}")
            return None

        result = response.json()
        if 'haste_url' not in result:
            print(" ⚠️ API response missing haste_url")
            return None

        # Re-point a haste.nixc.us link at the md.colinknapp.com viewer;
        # any other URL is returned unchanged.
        haste_url = result['haste_url']
        if 'haste.nixc.us/' in haste_url:
            token = haste_url.split('haste.nixc.us/')[-1]
            html_url = f"https://md.colinknapp.com/haste/{token}"
        else:
            html_url = haste_url
        print(" ✅ HTML version URL created")
        return html_url
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on.
        print(f" ⚠️ Failed to create HTML URL: {e}")
        return None


def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Upload the deck analysis and return links to its raw and HTML versions.

    The markdown is first reduced to text via ``create_text_only_markdown``.
    That text is then (1) posted to haste.nixc.us as a raw document and
    (2) posted to the md.colinknapp.com convert API for an HTML rendering.
    The two uploads are independent: a failure in one is printed as a
    warning and does not prevent the other from being attempted.

    Note: ``json`` is resolved from the module-level import. A function-local
    ``import json`` must not be reintroduced here, because it would make
    ``json`` a local name that is unbound on any path that skips the import.

    Args:
        markdown_content: Full markdown document to publish.
        document_title: Title inserted into the HTML page title.

    Returns:
        A ``(raw_haste_url, html_url)`` tuple. Each element is ``None`` when
        its upload failed; ``(None, None)`` if preparation itself failed.
    """
    try:
        print("Sending to API for URLs...")
        # Create text-only version for API.
        text_only_markdown = create_text_only_markdown(markdown_content)
        raw_haste_url = _upload_raw_markdown(text_only_markdown)
        html_url = _create_html_url(text_only_markdown, document_title)
        return raw_haste_url, html_url
    except Exception as e:
        print(f"⚠️ Failed to send to API: {e}")
        return None, None