# A quick little side script for looking into the crawl results.
# NOT used for the main workflow.

import asyncio
import json
import os

from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from playwright.async_api import async_playwright


# Check how many crawled pages are "invalid password" pages
# (it was not many -- like 7/100). Pages that do NOT contain the marker text
# are printed and counted as "good".

# Pass the encoding explicitly: open() otherwise falls back to the
# platform-dependent locale encoding, which can mis-decode a UTF-8 JSON file.
with open("crawl_results/successful_pages.json", "r", encoding="utf-8") as f:
    results = json.load(f)

counter = 0  # number of pages whose content lacks the invalid-password marker
total = len(results)  # number of entries loaded from the results file
for result in results:
    if "password is invalid" not in result['content']:
        print("\n\n\n FOUND: \n", result['content'])
        counter += 1

print(f"\n\n\n FINAL GOOD: {counter} OF {total} RESULTS")