Log raw content; add sources table if it does not exist

This commit is contained in:
jChenvan 2025-09-06 10:23:51 -04:00
parent 431d235e3b
commit 5700893b88
3 changed files with 15 additions and 2 deletions

View File

@ -2,4 +2,5 @@
feeds.csv
feed_contents.xml
page_content.json
logs.json
__pycache__/

View File

@ -1,6 +1,7 @@
import asyncio
import csv
from dataclasses import dataclass
import json
import os
from typing import Dict, List, Tuple
import feedparser
@ -135,7 +136,6 @@ async def fetch_site(url: str) -> str | None:
main_text = clean_string(main_text)
print(f"SUCCESSFUL FETCH: {url}")
print(f"FETCH CONTENT: {main_text[:140]}...")
# .get_text() with separator and strip for cleaner output
return main_text
else:
@ -145,7 +145,6 @@ async def fetch_site(url: str) -> str | None:
body_text = soup.body.get_text(separator='\n', strip=True)
body_text = clean_string(body_text)
print(f"SUCCESSFUL FETCH: {url}")
print(f"FETCH CONTENT: {body_text[:140]}...")
return body_text
except Exception as e:
@ -192,4 +191,9 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
})
print(f"\nSuccessfully fetched {len(pages)} webpages.")
with open("logs.json", "w") as f:
json.dump({
"urls":urls,
"results": results
}, f, indent = 4)
return pages

View File

@ -979,6 +979,14 @@ def view_sources():
try:
with conn.cursor() as cur:
cur.execute('''
CREATE TABLE IF NOT EXISTS sources (
src_id SERIAL PRIMARY KEY,
title VARCHAR(255) NOT NULL,
link VARCHAR(255) NOT NULL,
type VARCHAR(255) NOT NULL
)
''')
cur.execute('SELECT * FROM sources ORDER BY src_id DESC')
sources = cur.fetchall()
except Exception as e: