From bbb7808d1f25d15e02299c2e7e4c585bfcd59c51 Mon Sep 17 00:00:00 2001
From: colin
Date: Sun, 31 Aug 2025 11:58:09 -0400
Subject: [PATCH] feat: default-on JSON reports and exports; set export-dir default to exports

---
 main.go | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)

diff --git a/main.go b/main.go
index e148180..ccf6193 100644
--- a/main.go
+++ b/main.go
@@ -2,12 +2,15 @@ package main
 
 import (
 	"context"
+	"encoding/csv"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"net/http"
+	"net/url"
 	"os"
+	"path/filepath"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -28,6 +31,7 @@ func main() {
 	var sameHostOnly bool
 	var output string
 	var quiet bool
+	var exportDir string
 
 	flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
 	flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
@@ -37,6 +41,7 @@ func main() {
 	flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
 	flag.StringVar(&output, "output", "text", "Output format: text|json")
 	flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
+	flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
 	flag.Parse()
 
 	if strings.TrimSpace(target) == "" {
@@ -184,6 +189,17 @@ func main() {
 
 	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 
+	if exportDir != "" {
+		if err := exportAll(exportDir, reports); err != nil {
+			fmt.Fprintf(os.Stderr, "export error: %v\n", err)
+		}
+	}
+
+	// Save JSON report to ./reports/<host>.json by default (ignored by git)
+	if err := saveReportJSON("reports", reports); err != nil {
+		fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
+	}
+
 	switch output {
 	case "json":
 		enc := json.NewEncoder(os.Stdout)
@@ -204,3 +220,175 @@ func truncateForTTY(s string, max int) string {
 	}
 	return s[:max-1] + "…"
 }
+
+// exportAll writes the CSV and NDJSON exports for r into baseDir/<host>,
+// where <host> is the host of r.Target, creating the directory if needed.
+// It stops at the first failing step and returns that step's error.
+func exportAll(baseDir string, r report.Report) error {
+	u, err := url.Parse(r.Target)
+	if err != nil || u.Host == "" {
+		return fmt.Errorf("invalid target for export: %s", r.Target)
+	}
+	dir := filepath.Join(baseDir, u.Host)
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return err
+	}
+	if err := exportCSVPages(filepath.Join(dir, "pages.csv"), r); err != nil {
+		return err
+	}
+	if err := exportCSVLinks(filepath.Join(dir, "links.csv"), r); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "pages.ndjson"), pagesToNDJSON(r)); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "links.ndjson"), linksToNDJSON(r)); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "link_statuses.ndjson"), linkStatusesToNDJSON(r)); err != nil {
+		return err
+	}
+	return nil
+}
+
+// writeExportFile creates (or truncates) path, hands the open file to write,
+// and closes it afterwards. When write succeeds, a Close error is returned
+// instead of being dropped, so a failed final write to disk is reported.
+func writeExportFile(path string, write func(f *os.File) error) (err error) {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+	return write(f)
+}
+
+// exportCSVPages writes a header row followed by one row per entry of
+// r.Pages to path. r.Pages is ranged over as a map, so row order is
+// unspecified.
+func exportCSVPages(path string, r report.Report) error {
+	return writeExportFile(path, func(f *os.File) error {
+		w := csv.NewWriter(f)
+		if err := w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"}); err != nil {
+			return err
+		}
+		for u, pm := range r.Pages {
+			rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
+			if err := w.Write(rec); err != nil {
+				return err
+			}
+		}
+		// Flush before reading Error: the writer is buffered, so a failed
+		// write may only surface once the buffer is flushed.
+		w.Flush()
+		return w.Error()
+	})
+}
+
+// exportCSVLinks writes a header row followed by one (source, target) row
+// per outlink in r.PageOutlinks to path. Sources are ranged over as a map,
+// so the order of source groups is unspecified.
+func exportCSVLinks(path string, r report.Report) error {
+	return writeExportFile(path, func(f *os.File) error {
+		w := csv.NewWriter(f)
+		if err := w.Write([]string{"sourceUrl", "targetUrl"}); err != nil {
+			return err
+		}
+		for src, lst := range r.PageOutlinks {
+			for _, dst := range lst {
+				if err := w.Write([]string{src, dst}); err != nil {
+					return err
+				}
+			}
+		}
+		// Flush before reading Error so buffered write failures are seen.
+		w.Flush()
+		return w.Error()
+	})
+}
+
+// ndjsonItem is one value that is encoded as a single NDJSON line.
+type ndjsonItem any
+
+// exportNDJSON writes items to path as newline-delimited JSON, one encoded
+// value per line.
+func exportNDJSON(path string, items []ndjsonItem) error {
+	return writeExportFile(path, func(f *os.File) error {
+		enc := json.NewEncoder(f)
+		for _, it := range items {
+			if err := enc.Encode(it); err != nil {
+				return err
+			}
+		}
+		return nil
+	})
+}
+
+// pagesToNDJSON returns one "page" record per entry of r.Pages.
+func pagesToNDJSON(r report.Report) []ndjsonItem {
+	res := make([]ndjsonItem, 0, len(r.Pages))
+	for u, pm := range r.Pages {
+		res = append(res, map[string]any{
+			"type":           "page",
+			"url":            u,
+			"title":          pm.Title,
+			"responseTimeMs": pm.ResponseTimeMs,
+			"contentLength":  pm.ContentLength,
+			"depth":          pm.Depth,
+		})
+	}
+	return res
+}
+
+// linksToNDJSON returns one "link" record per (source, target) pair in
+// r.PageOutlinks.
+func linksToNDJSON(r report.Report) []ndjsonItem {
+	var res []ndjsonItem
+	for src, lst := range r.PageOutlinks {
+		for _, dst := range lst {
+			res = append(res, map[string]any{
+				"type": "link",
+				"src":  src,
+				"dest": dst,
+			})
+		}
+	}
+	return res
+}
+
+// linkStatusesToNDJSON returns one "link_status" record per entry of
+// r.LinkStatuses.
+func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
+	res := make([]ndjsonItem, 0, len(r.LinkStatuses))
+	for _, ls := range r.LinkStatuses {
+		res = append(res, map[string]any{
+			"type":       "link_status",
+			"url":        ls.URL,
+			"statusCode": ls.StatusCode,
+			"ok":         ls.OK,
+			"error":      ls.Err,
+		})
+	}
+	return res
+}
+
+// saveReportJSON writes r as indented JSON to baseDir/<host>.json, where
+// <host> is the host of r.Target, creating baseDir if needed.
+func saveReportJSON(baseDir string, r report.Report) error {
+	u, err := url.Parse(r.Target)
+	if err != nil || u.Host == "" {
+		return fmt.Errorf("invalid target for save: %s", r.Target)
+	}
+	if err := os.MkdirAll(baseDir, 0o755); err != nil {
+		return err
+	}
+	return writeExportFile(filepath.Join(baseDir, u.Host+".json"), func(f *os.File) error {
+		enc := json.NewEncoder(f)
+		enc.SetIndent("", " ")
+		return enc.Encode(r)
+	})
+}