ploughshares/docker/crawler/check.py

26 lines
856 B
Python

# a quick little side script to look into the results
# NOT used for main workflow
import asyncio
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Count how many crawled pages are NOT "password is invalid" pages
# (only a few were invalid -- roughly 7 out of 100).
RESULTS_PATH = "crawl_results/successful_pages.json"
INVALID_PASSWORD_MARKER = "password is invalid"


def count_valid_pages(results):
    """Count the page records whose content lacks the invalid-password marker.

    The content of every page that does not contain the marker is printed
    so it can be inspected by eye.

    Args:
        results: A list of page records; each record is indexed with
            ``'content'`` and that value is searched for the marker.

    Returns:
        A ``(good, total)`` tuple: the number of records without the marker
        and the total number of records.

    Raises:
        KeyError: If a record has no ``'content'`` entry.
    """
    good = 0
    for page in results:
        content = page['content']
        if INVALID_PASSWORD_MARKER not in content:
            print("\n\n\n FOUND: \n", content)
            good += 1
    return good, len(results)


def main():
    """Load the saved crawl results and print how many pages are good."""
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding; JSON text is UTF-8 by specification.
    with open(RESULTS_PATH, "r", encoding="utf-8") as f:
        results = json.load(f)
    good, total = count_valid_pages(results)
    print(f"\n\n\n FINAL GOOD: {good} OF {total} RESULTS")


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()