feat: default-on JSON reports and exports; set export-dir default to exports

This commit is contained in:
colin 2025-08-31 11:58:09 -04:00
parent a3d277488f
commit bbb7808d1f
1 changed file with 155 additions and 0 deletions

155
main.go
View File

@ -2,12 +2,15 @@ package main
import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync/atomic"
"time"
@ -28,6 +31,7 @@ func main() {
var sameHostOnly bool
var output string
var quiet bool
var exportDir string
flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
@ -37,6 +41,7 @@ func main() {
flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
flag.StringVar(&output, "output", "text", "Output format: text|json")
flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
flag.Parse()
if strings.TrimSpace(target) == "" {
@ -184,6 +189,17 @@ func main() {
reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
if exportDir != "" {
if err := exportAll(exportDir, reports); err != nil {
fmt.Fprintf(os.Stderr, "export error: %v\n", err)
}
}
// Save JSON report to ./reports/<host>.json by default (ignored by git)
if err := saveReportJSON("reports", reports); err != nil {
fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
}
switch output {
case "json":
enc := json.NewEncoder(os.Stdout)
@ -204,3 +220,142 @@ func truncateForTTY(s string, max int) string {
}
return s[:max-1] + "…"
}
// exportAll writes the full set of export files for r into the directory
// baseDir/<host>, where <host> is the Host component of r.Target.
//
// The directory is created if needed. Files are written in a fixed order
// (pages.csv, links.csv, pages.ndjson, links.ndjson, link_statuses.ndjson)
// and the first failure aborts the remaining exports and is returned as-is.
// An error is also returned when r.Target does not parse or has no host.
func exportAll(baseDir string, r report.Report) error {
	parsed, parseErr := url.Parse(r.Target)
	if parseErr != nil || parsed.Host == "" {
		return fmt.Errorf("invalid target for export: %s", r.Target)
	}

	outDir := filepath.Join(baseDir, parsed.Host)
	if mkErr := os.MkdirAll(outDir, 0o755); mkErr != nil {
		return mkErr
	}

	// Each step receives the full output path; the NDJSON payloads are built
	// lazily inside the closure, right before the file is written.
	steps := []struct {
		file  string
		write func(dst string) error
	}{
		{"pages.csv", func(dst string) error { return exportCSVPages(dst, r) }},
		{"links.csv", func(dst string) error { return exportCSVLinks(dst, r) }},
		{"pages.ndjson", func(dst string) error { return exportNDJSON(dst, pagesToNDJSON(r)) }},
		{"links.ndjson", func(dst string) error { return exportNDJSON(dst, linksToNDJSON(r)) }},
		{"link_statuses.ndjson", func(dst string) error { return exportNDJSON(dst, linkStatusesToNDJSON(r)) }},
	}
	for _, step := range steps {
		if stepErr := step.write(filepath.Join(outDir, step.file)); stepErr != nil {
			return stepErr
		}
	}
	return nil
}
// exportCSVPages writes one CSV row per entry of r.Pages to the file at path,
// preceded by the header row url,title,responseTimeMs,contentLength,depth.
//
// The file is created (or truncated) with os.Create. Rows follow the iteration
// order of r.Pages; if that is a map (it is ranged with a string key), the row
// order is not stable between runs.
//
// It returns the first error encountered while creating the file, writing a
// record, flushing the CSV writer, or closing the file.
func exportCSVPages(path string, r report.Report) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	w := csv.NewWriter(f)
	if werr := w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"}); werr != nil {
		return werr
	}
	for u, pm := range r.Pages {
		rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
		if werr := w.Write(rec); werr != nil {
			return werr
		}
	}

	// Flush must run BEFORE Error is consulted: csv.Writer buffers its output,
	// so write failures typically only show up at flush time. A deferred Flush
	// would run after the return value had already been computed.
	w.Flush()
	return w.Error()
}
// exportCSVLinks writes one CSV row per (source, target) pair found in
// r.PageOutlinks to the file at path, preceded by the header row
// sourceUrl,targetUrl.
//
// The file is created (or truncated) with os.Create. Sources follow the
// iteration order of r.PageOutlinks; targets keep the order of each list.
//
// It returns the first error encountered while creating the file, writing a
// record, flushing the CSV writer, or closing the file.
func exportCSVLinks(path string, r report.Report) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	w := csv.NewWriter(f)
	if werr := w.Write([]string{"sourceUrl", "targetUrl"}); werr != nil {
		return werr
	}
	for src, lst := range r.PageOutlinks {
		for _, dst := range lst {
			if werr := w.Write([]string{src, dst}); werr != nil {
				return werr
			}
		}
	}

	// Flush must run BEFORE Error is consulted: csv.Writer buffers its output,
	// so write failures typically only show up at flush time. A deferred Flush
	// would run after the return value had already been computed.
	w.Flush()
	return w.Error()
}
// ndjsonItem is a single value that exportNDJSON encodes as one JSON line.
type ndjsonItem any

// exportNDJSON writes items to the file at path as newline-delimited JSON:
// each item is JSON-encoded on its own line, in slice order.
//
// The file is created (or truncated) with os.Create; a nil or empty items
// slice produces an empty file. Encoding stops at the first item that fails
// to encode, leaving any earlier lines in the file.
//
// It returns the first error from creating the file, encoding an item, or
// closing the file.
func exportNDJSON(path string, items []ndjsonItem) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	enc := json.NewEncoder(f)
	for _, it := range items {
		// Encode appends the trailing newline that separates NDJSON records.
		if encErr := enc.Encode(it); encErr != nil {
			return encErr
		}
	}
	return nil
}
// pagesToNDJSON turns every entry of r.Pages into a "page" record suitable
// for exportNDJSON. Each record carries the page URL together with its title,
// response time, content length and depth. Records follow the iteration order
// of r.Pages.
func pagesToNDJSON(r report.Report) []ndjsonItem {
	items := make([]ndjsonItem, 0, len(r.Pages))
	for pageURL, info := range r.Pages {
		record := map[string]any{}
		record["type"] = "page"
		record["url"] = pageURL
		record["title"] = info.Title
		record["responseTimeMs"] = info.ResponseTimeMs
		record["contentLength"] = info.ContentLength
		record["depth"] = info.Depth
		items = append(items, record)
	}
	return items
}
// linksToNDJSON flattens r.PageOutlinks into one "link" record per
// (source, destination) pair, suitable for exportNDJSON. The result is nil
// when there are no outlinks at all.
func linksToNDJSON(r report.Report) []ndjsonItem {
	var items []ndjsonItem
	for from, targets := range r.PageOutlinks {
		for _, to := range targets {
			record := map[string]any{}
			record["type"] = "link"
			record["src"] = from
			record["dest"] = to
			items = append(items, record)
		}
	}
	return items
}
// linkStatusesToNDJSON turns every entry of r.LinkStatuses into a
// "link_status" record suitable for exportNDJSON, carrying the checked URL,
// its status code, the OK flag and the recorded error value.
func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
	items := make([]ndjsonItem, 0, len(r.LinkStatuses))
	for _, status := range r.LinkStatuses {
		record := map[string]any{}
		record["type"] = "link_status"
		record["url"] = status.URL
		record["statusCode"] = status.StatusCode
		record["ok"] = status.OK
		record["error"] = status.Err
		items = append(items, record)
	}
	return items
}
// saveReportJSON writes r as indented JSON to baseDir/<host>.json, where
// <host> is the Host component of r.Target.
//
// baseDir is created if it does not exist, and an existing report file for
// the same host is overwritten.
//
// It returns an error when r.Target does not parse or has no host, or the
// first error from creating the directory or file, encoding the report, or
// closing the file.
func saveReportJSON(baseDir string, r report.Report) (err error) {
	u, err := url.Parse(r.Target)
	if err != nil || u.Host == "" {
		return fmt.Errorf("invalid target for save: %s", r.Target)
	}
	if err := os.MkdirAll(baseDir, 0o755); err != nil {
		return err
	}
	path := filepath.Join(baseDir, u.Host+".json")
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()
	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	return enc.Encode(r)
}