package main

import (
	"context"
	"encoding/csv"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"time"

	"urlcrawler/internal/crawler"
	"urlcrawler/internal/linkcheck"
	"urlcrawler/internal/report"
	"urlcrawler/internal/sitemap"
	"urlcrawler/internal/urlutil"
)

func main() {
	var target string
	var concurrency int
	var timeout time.Duration
	var maxDepth int
	var userAgent string
	var sameHostOnly bool
	var output string
	var quiet bool
	var exportDir string

	flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
	flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
	flag.DurationVar(&timeout, "timeout", 10*time.Second, "HTTP timeout per request")
	flag.IntVar(&maxDepth, "max-depth", 2, "Maximum crawl depth (0=crawl only the start page)")
	flag.StringVar(&userAgent, "user-agent", "urlcrawler/1.0", "User-Agent header value")
	flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
	flag.StringVar(&output, "output", "text", "Output format: text|json")
	flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
	flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
	flag.Parse()

	if strings.TrimSpace(target) == "" {
		fmt.Fprintln(os.Stderr, "-target is required")
		flag.Usage()
		os.Exit(2)
	}

	client := &http.Client{Timeout: timeout}
	ctx := context.Background()

	// Report metadata
	started := time.Now()
	meta := report.Metadata{StartedAt: started.UTC().Format(time.RFC3339)}
	params := report.Params{
		MaxDepth:     maxDepth,
		Concurrency:  concurrency,
		TimeoutMs:    timeout.Milliseconds(),
		UserAgent:    userAgent,
		SameHostOnly: sameHostOnly,
	}

	fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)

	// Progress counters shared between the crawler callbacks and the
	// progress-reporter goroutine below.
	var urlsVisited, urlsErrored atomic.Int64
	var currentURL atomic.Value // holds a string
	var pendingTasks atomic.Int64

	// Start the crawl-phase progress reporter if not in quiet mode.
	progressCtx, stopProgress := context.WithCancel(ctx)
	defer stopProgress()
	if !quiet {
		go func() {
			ticker := time.NewTicker(500 * time.Millisecond)
			defer ticker.Stop()
			for {
				select {
				case <-ticker.C:
					cu, _ := currentURL.Load().(string)
					fmt.Fprintf(os.Stderr, "\rURLs visited: %d | Errors: %d | Pending: %d | Current: %s",
						urlsVisited.Load(), urlsErrored.Load(), pendingTasks.Load(), truncateForTTY(cu, 90))
				case <-progressCtx.Done():
					return
				}
			}
		}()
	}

	// Progress callbacks invoked by the crawler.
	visitedCallback := func(u string, depth int, pending int) {
		urlsVisited.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}
	errorCallback := func(u string, err error, pending int) {
		urlsErrored.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}

	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)

	// Stop the crawl-phase reporter so it cannot interleave with the
	// link-check output, then clear the progress line before the next phase.
	stopProgress()
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rCrawl complete! URLs visited: %d | Errors: %d\n", urlsVisited.Load(), urlsErrored.Load())
	}

	fmt.Fprintf(os.Stderr, "Fetching sitemap...\n")
	smURLs, err := sitemap.FetchAll(ctx, target, client, userAgent)
	if err != nil && !errors.Is(err, sitemap.ErrNotFound) {
		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
	}

	// Robots.txt summary (presence check only). The body is closed
	// immediately rather than deferred, since a defer inside a block
	// would not run until main returns.
	robots := report.RobotsSummary{}
	robotsURL := urlutil.Origin(target) + "/robots.txt"
	if req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil); err == nil {
		req.Header.Set("User-Agent", userAgent)
		if resp, err := client.Do(req); err == nil {
			if resp.StatusCode == http.StatusOK {
				robots.Present = true
				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
			}
			resp.Body.Close()
		}
	}

	// Build the set of all unique links discovered across pages for status checks.
	allLinks := make(map[string]struct{})
	for _, m := range outlinks {
		for u := range m {
			allLinks[u] = struct{}{}
		}
	}
	// Also include the visited pages themselves.
	for u := range visited {
		allLinks[u] = struct{}{}
	}

	fmt.Fprintf(os.Stderr, "Checking %d links...\n", len(allLinks))

	// Reuse the counters for link-check tallies: urlsVisited now counts
	// OK links, urlsErrored counts failures.
	urlsVisited.Store(0)
	urlsErrored.Store(0)
	linkCheckCallback := func(ok bool) {
		if ok {
			urlsVisited.Add(1)
		} else {
			urlsErrored.Add(1)
		}
	}

	checkResults := linkcheck.Check(ctx, allLinks, concurrency, client, userAgent, !quiet, linkCheckCallback)

	// Clear the progress line before finishing.
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rLink checking complete! OK: %d | Errors: %d\n", urlsVisited.Load(), urlsErrored.Load())
	}

	finished := time.Now()
	meta.FinishedAt = finished.UTC().Format(time.RFC3339)
	meta.DurationMs = finished.Sub(started).Milliseconds()

	fmt.Fprintf(os.Stderr, "Building report...\n")

	// Convert crawler page info into the report's PageMeta shape.
	pages := make(map[string]report.PageMeta, len(pageInfo))
	for u, pi := range pageInfo {
		pages[u] = report.PageMeta{
			Title:          pi.Title,
			ResponseTimeMs: pi.ResponseTimeMs,
			ContentLength:  pi.ContentLength,
			Depth:          pi.Depth,
		}
	}

	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)

	if exportDir != "" {
		if err := exportAll(exportDir, reports); err != nil {
			fmt.Fprintf(os.Stderr, "export error: %v\n", err)
		}
	}

	// Save the JSON report to ./reports/<host>.json by default (ignored by git).
	if err := saveReportJSON("reports", reports); err != nil {
		fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
	}

	switch output {
	case "json":
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		_ = enc.Encode(reports)
	default:
		report.PrintText(os.Stdout, reports)
	}
}
// truncateForTTY truncates s to at most max characters, replacing the tail
// with … if needed. max <= 0 disables truncation.
func truncateForTTY(s string, max int) string {
	if max <= 0 {
		return s
	}
	// Work in runes so multi-byte UTF-8 characters are never split.
	r := []rune(s)
	if len(r) <= max {
		return s
	}
	if max == 1 {
		return "…"
	}
	return string(r[:max-1]) + "…"
}

func exportAll(baseDir string, r report.Report) error {
	u, err := url.Parse(r.Target)
	if err != nil || u.Host == "" {
		return fmt.Errorf("invalid target for export: %s", r.Target)
	}
	// One subdirectory per target host.
	dir := filepath.Join(baseDir, u.Host)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return err
	}
	if err := exportCSVPages(filepath.Join(dir, "pages.csv"), r); err != nil {
		return err
	}
	if err := exportCSVLinks(filepath.Join(dir, "links.csv"), r); err != nil {
		return err
	}
	if err := exportNDJSON(filepath.Join(dir, "pages.ndjson"), pagesToNDJSON(r)); err != nil {
		return err
	}
	if err := exportNDJSON(filepath.Join(dir, "links.ndjson"), linksToNDJSON(r)); err != nil {
		return err
	}
	if err := exportNDJSON(filepath.Join(dir, "link_statuses.ndjson"), linkStatusesToNDJSON(r)); err != nil {
		return err
	}
	return nil
}
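
// For orientation: with the default flags, a run against https://example.com
// (the example host from the -target usage string) leaves this layout on
// disk, via exportAll above and saveReportJSON below:
//
//	exports/example.com/pages.csv
//	exports/example.com/links.csv
//	exports/example.com/pages.ndjson
//	exports/example.com/links.ndjson
//	exports/example.com/link_statuses.ndjson
//	reports/example.com.json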
func exportCSVPages(path string, r report.Report) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	w := csv.NewWriter(f)
	// Individual Write errors are surfaced by w.Error() after the final Flush.
	_ = w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"})
	for u, pm := range r.Pages {
		rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
		_ = w.Write(rec)
	}
	// Flush before reading w.Error(); a deferred Flush would run too late
	// for its error to be reported.
	w.Flush()
	return w.Error()
}

func exportCSVLinks(path string, r report.Report) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	w := csv.NewWriter(f)
	_ = w.Write([]string{"sourceUrl", "targetUrl"})
	for src, lst := range r.PageOutlinks {
		for _, dst := range lst {
			_ = w.Write([]string{src, dst})
		}
	}
	w.Flush()
	return w.Error()
}

// ndjsonItem is any JSON-encodable record written to an NDJSON export.
type ndjsonItem = any

func exportNDJSON(path string, items []ndjsonItem) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	// json.Encoder writes one newline-terminated object per Encode call,
	// which is exactly the NDJSON framing.
	enc := json.NewEncoder(f)
	for _, it := range items {
		if err := enc.Encode(it); err != nil {
			return err
		}
	}
	return nil
}

func pagesToNDJSON(r report.Report) []ndjsonItem {
	res := make([]ndjsonItem, 0, len(r.Pages))
	for u, pm := range r.Pages {
		res = append(res, map[string]any{
			"type":           "page",
			"url":            u,
			"title":          pm.Title,
			"responseTimeMs": pm.ResponseTimeMs,
			"contentLength":  pm.ContentLength,
			"depth":          pm.Depth,
		})
	}
	return res
}

func linksToNDJSON(r report.Report) []ndjsonItem {
	var res []ndjsonItem
	for src, lst := range r.PageOutlinks {
		for _, dst := range lst {
			res = append(res, map[string]any{
				"type": "link",
				"src":  src,
				"dest": dst,
			})
		}
	}
	return res
}

func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
	res := make([]ndjsonItem, 0, len(r.LinkStatuses))
	for _, ls := range r.LinkStatuses {
		res = append(res, map[string]any{
			"type":       "link_status",
			"url":        ls.URL,
			"statusCode": ls.StatusCode,
			"ok":         ls.OK,
			"error":      ls.Err,
		})
	}
	return res
}

func saveReportJSON(baseDir string, r report.Report) error {
	u, err := url.Parse(r.Target)
	if err != nil || u.Host == "" {
		return fmt.Errorf("invalid target for save: %s", r.Target)
	}
	if err := os.MkdirAll(baseDir, 0o755); err != nil {
		return err
	}
	path := filepath.Join(baseDir, u.Host+".json")
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	return enc.Encode(r)
}
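
// For reference, each line written by exportNDJSON is one JSON object.
// encoding/json sorts map keys alphabetically, so a link_status record
// (field values illustrative) looks like:
//
//	{"error":"","ok":true,"statusCode":200,"type":"link_status","url":"https://example.com/"}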