From 5700893b8864f18b9f1a1abac452d3b32bc2c519 Mon Sep 17 00:00:00 2001 From: jChenvan <188939308+jChenvan@users.noreply.github.com> Date: Sat, 6 Sep 2025 10:23:51 -0400 Subject: [PATCH] Log raw content, Add sources table if not exists --- docker/crawler-google-alerts/.gitignore | 1 + docker/crawler-google-alerts/get_all_feed_contents.py | 8 ++++++-- docker/ploughshares/app.py | 8 ++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docker/crawler-google-alerts/.gitignore b/docker/crawler-google-alerts/.gitignore index 9b97969..a43a641 100644 --- a/docker/crawler-google-alerts/.gitignore +++ b/docker/crawler-google-alerts/.gitignore @@ -2,4 +2,5 @@ feeds.csv feed_contents.xml page_content.json +logs.json __pycache__/ \ No newline at end of file diff --git a/docker/crawler-google-alerts/get_all_feed_contents.py b/docker/crawler-google-alerts/get_all_feed_contents.py index a3763b8..eb72fdb 100644 --- a/docker/crawler-google-alerts/get_all_feed_contents.py +++ b/docker/crawler-google-alerts/get_all_feed_contents.py @@ -1,6 +1,7 @@ import asyncio import csv from dataclasses import dataclass +import json import os from typing import Dict, List, Tuple import feedparser @@ -135,7 +136,6 @@ async def fetch_site(url: str) -> str | None: main_text = clean_string(main_text) print(f"SUCCESSFUL FETCH: {url}") - print(f"FETCH CONTENT: {main_text[:140]}...") # .get_text() with separator and strip for cleaner output return main_text else: @@ -145,7 +145,6 @@ async def fetch_site(url: str) -> str | None: body_text = soup.body.get_text(separator='\n', strip=True) body_text = clean_string(body_text) print(f"SUCCESSFUL FETCH: {url}") - print(f"FETCH CONTENT: {body_text[:140]}...") return body_text except Exception as e: @@ -192,4 +191,9 @@ async def get_all_feed_contents() -> List[Dict[str, str]]: }) print(f"\nSuccessfully fetched {len(pages)} webpages.") + with open("logs.json", "w") as f: + json.dump({ + "urls":urls, + "results": results + }, f, indent = 4) return pages \ No newline at end of file diff --git a/docker/ploughshares/app.py b/docker/ploughshares/app.py index a40dd84..884592c 100644 --- a/docker/ploughshares/app.py +++ b/docker/ploughshares/app.py @@ -979,6 +979,14 @@ def view_sources(): try: with conn.cursor() as cur: + cur.execute(''' + CREATE TABLE IF NOT EXISTS sources ( + src_id SERIAL PRIMARY KEY, + title VARCHAR(255) NOT NULL, + link VARCHAR(255) NOT NULL, + type VARCHAR(255) NOT NULL + ) + ''') cur.execute('SELECT * FROM sources ORDER BY src_id DESC') sources = cur.fetchall() except Exception as e: