Log raw content, Add sources table if not exists
parent 431d235e3b
commit 5700893b88
@@ -2,4 +2,5 @@
 feeds.csv
 feed_contents.xml
 page_content.json
+logs.json
 __pycache__/
@@ -1,6 +1,7 @@
 import asyncio
 import csv
 from dataclasses import dataclass
+import json
 import os
 from typing import Dict, List, Tuple
 import feedparser
@@ -135,7 +136,6 @@ async def fetch_site(url: str) -> str | None:
             main_text = clean_string(main_text)

             print(f"SUCCESSFUL FETCH: {url}")
-            print(f"FETCH CONTENT: {main_text[:140]}...")
             # .get_text() with separator and strip for cleaner output
             return main_text
         else:
@@ -145,7 +145,6 @@ async def fetch_site(url: str) -> str | None:
             body_text = soup.body.get_text(separator='\n', strip=True)
             body_text = clean_string(body_text)
             print(f"SUCCESSFUL FETCH: {url}")
-            print(f"FETCH CONTENT: {body_text[:140]}...")
             return body_text

     except Exception as e:
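For reference, the fallback branch kept as context above relies on BeautifulSoup's get_text(separator='\n', strip=True); a standalone sketch of that call follows, with inline sample markup standing in for the fetched page (the sample HTML and parser choice are assumptions for illustration, not from the repo):

from bs4 import BeautifulSoup

# Sample markup standing in for a fetched page; the real code parses the HTTP response body.
html = "<html><body><h1>Title</h1><p>First paragraph.</p><p>Second paragraph.</p></body></html>"
soup = BeautifulSoup(html, "html.parser")

# separator='\n' emits one line per text fragment, strip=True trims surrounding whitespace,
# matching the call shown as context in the hunk above.
body_text = soup.body.get_text(separator='\n', strip=True)
print(body_text)
# Title
# First paragraph.
# Second paragraph.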
@@ -192,4 +191,9 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         })

     print(f"\nSuccessfully fetched {len(pages)} webpages.")
+    with open("logs.json", "w") as f:
+        json.dump({
+            "urls":urls,
+            "results": results
+        }, f, indent = 4)
     return pages
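The added lines above dump the raw fetch results to logs.json; a minimal sketch of reading that log back for inspection, assuming "urls" and "results" are parallel lists of URLs and fetched text (the pairing and the 140-character preview are assumptions taken from the surrounding hunks):

import json

# Load the log written by get_all_feed_contents() in the hunk above.
with open("logs.json") as f:
    log = json.load(f)

# Assumes log["urls"][i] corresponds to log["results"][i]; failed fetches may be null.
for url, result in zip(log["urls"], log["results"]):
    preview = (result or "")[:140]
    print(f"{url}\n  {preview}...\n")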
@@ -979,6 +979,14 @@ def view_sources():

     try:
         with conn.cursor() as cur:
+            cur.execute('''
+                CREATE TABLE IF NOT EXISTS sources (
+                    src_id SERIAL PRIMARY KEY,
+                    title VARCHAR(255) NOT NULL,
+                    link VARCHAR(255) NOT NULL,
+                    type VARCHAR(255) NOT NULL
+                )
+            ''')
             cur.execute('SELECT * FROM sources ORDER BY src_id DESC')
             sources = cur.fetchall()
     except Exception as e:
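The SERIAL column in the added CREATE TABLE points to PostgreSQL; below is a self-contained sketch of the same create-if-missing-then-read pattern, assuming psycopg2 and a placeholder DSN (the connection details and function shape are assumptions, only the SQL comes from the hunk):

import psycopg2

def view_sources(dsn="dbname=app user=app password=secret host=localhost"):
    # Placeholder DSN; in the real view, conn is built elsewhere before this block runs.
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            # IF NOT EXISTS makes the DDL safe to run on every request.
            cur.execute('''
                CREATE TABLE IF NOT EXISTS sources (
                    src_id SERIAL PRIMARY KEY,
                    title VARCHAR(255) NOT NULL,
                    link VARCHAR(255) NOT NULL,
                    type VARCHAR(255) NOT NULL
                )
            ''')
            conn.commit()  # persist the table before selecting from it
            cur.execute('SELECT * FROM sources ORDER BY src_id DESC')
            return cur.fetchall()
    finally:
        conn.close()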