gosint-sitecrawl/main.go

362 lines
9.4 KiB
Go

package main
import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync/atomic"
"time"
"urlcrawler/internal/crawler"
"urlcrawler/internal/linkcheck"
"urlcrawler/internal/report"
"urlcrawler/internal/sitemap"
"urlcrawler/internal/urlutil"
)
// main parses the command-line flags, crawls the target site, fetches its
// sitemap and robots.txt, checks every discovered link, builds the report,
// writes the exports and the saved JSON report, and finally prints the report
// to stdout in the requested format. It exits with status 2 when -target is
// missing or blank.
func main() {
	var (
		target       string
		concurrency  int
		timeout      time.Duration
		maxDepth     int
		userAgent    string
		sameHostOnly bool
		output       string
		quiet        bool
		exportDir    string
	)
	flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
	flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
	flag.DurationVar(&timeout, "timeout", 10*time.Second, "HTTP timeout per request")
	flag.IntVar(&maxDepth, "max-depth", 2, "Maximum crawl depth (0=crawl only the start page)")
	flag.StringVar(&userAgent, "user-agent", "urlcrawler/1.0", "User-Agent header value")
	flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
	flag.StringVar(&output, "output", "text", "Output format: text|json")
	flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
	flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
	flag.Parse()

	if strings.TrimSpace(target) == "" {
		fmt.Fprintln(os.Stderr, "-target is required")
		flag.Usage()
		os.Exit(2)
	}

	client := &http.Client{Timeout: timeout}
	ctx := context.Background()

	// Report metadata
	started := time.Now()
	meta := report.Metadata{StartedAt: started.UTC().Format(time.RFC3339)}
	params := report.Params{
		MaxDepth:     maxDepth,
		Concurrency:  concurrency,
		TimeoutMs:    timeout.Milliseconds(),
		UserAgent:    userAgent,
		SameHostOnly: sameHostOnly,
	}

	fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)

	// Progress counters shared between the callbacks and the reporter goroutine.
	var urlsVisited, urlsErrored atomic.Int64
	var currentURL atomic.Value // string
	var pendingTasks atomic.Int64

	// Start the progress reporter unless in quiet mode. progressDone is closed
	// when the reporter has exited (immediately in quiet mode), so main can
	// wait for it before writing the final output.
	ctxWithCancel, cancel := context.WithCancel(ctx)
	defer cancel()
	progressDone := make(chan struct{})
	if quiet {
		close(progressDone)
	} else {
		go func() {
			defer close(progressDone)
			ticker := time.NewTicker(500 * time.Millisecond)
			defer ticker.Stop()
			for {
				select {
				case <-ticker.C:
					cu, _ := currentURL.Load().(string)
					fmt.Fprintf(os.Stderr, "\rURLs visited: %d | Errors: %d | Pending: %d | Current: %s",
						urlsVisited.Load(), urlsErrored.Load(), pendingTasks.Load(), truncateForTTY(cu, 90))
				case <-ctxWithCancel.Done():
					return
				}
			}
		}()
	}

	// Progress callback functions
	visitedCallback := func(u string, depth int, pending int) {
		urlsVisited.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}
	errorCallback := func(u string, err error, pending int) {
		urlsErrored.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}

	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)

	// Clear progress line before moving to next phase
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rCrawl complete! URLs visited: %d | Errors: %d\n",
			urlsVisited.Load(), urlsErrored.Load())
	}

	fmt.Fprintf(os.Stderr, "Fetching sitemap...\n")
	smURLs, err := sitemap.FetchAll(ctx, target, client, userAgent)
	if err != nil && !errors.Is(err, sitemap.ErrNotFound) {
		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
	}

	// Robots.txt summary (simple): best effort, a failed fetch simply leaves
	// the summary at its zero value.
	robots := report.RobotsSummary{}
	robotsURL := urlutil.Origin(target) + "/robots.txt"
	{
		req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
		if reqErr != nil {
			// Building the request fails when robotsURL cannot be parsed;
			// req is nil in that case and must not be used.
			fmt.Fprintf(os.Stderr, "robots.txt request error: %v\n", reqErr)
		} else {
			req.Header.Set("User-Agent", userAgent)
			if resp, doErr := client.Do(req); doErr == nil {
				if resp.StatusCode == http.StatusOK {
					robots.Present = true
					robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
				}
				// Close right away instead of deferring to the end of main,
				// so the body is not held open during link checking.
				resp.Body.Close()
			}
		}
	}

	// Build set of all unique links discovered across pages for status checks
	allLinks := make(map[string]struct{})
	for _, m := range outlinks {
		for u := range m {
			allLinks[u] = struct{}{}
		}
	}
	// Also include the visited pages themselves
	for u := range visited {
		allLinks[u] = struct{}{}
	}

	fmt.Fprintf(os.Stderr, "Checking %d links...\n", len(allLinks))

	// Reset counters for link checking
	urlsVisited.Store(0)
	urlsErrored.Store(0)

	// Progress callback for link checking: counts OK results in urlsVisited
	// and failures in urlsErrored.
	linkCheckCallback := func(ok bool) {
		if ok {
			urlsVisited.Add(1)
		} else {
			urlsErrored.Add(1)
		}
	}

	checkResults := linkcheck.Check(ctx, allLinks, concurrency, client, userAgent, !quiet, linkCheckCallback)

	// Stop the progress reporter and wait for it to exit so that no late tick
	// overwrites the completion line or interleaves with the final report.
	cancel()
	<-progressDone

	// Clear progress line before finishing
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rLink checking complete! OK: %d | Errors: %d\n",
			urlsVisited.Load(), urlsErrored.Load())
	}

	finished := time.Now()
	meta.FinishedAt = finished.UTC().Format(time.RFC3339)
	meta.DurationMs = finished.Sub(started).Milliseconds()

	fmt.Fprintf(os.Stderr, "Building report...\n")

	// Convert pageInfo to report.PageMeta
	pages := make(map[string]report.PageMeta, len(pageInfo))
	for u, pi := range pageInfo {
		pages[u] = report.PageMeta{
			Title:          pi.Title,
			ResponseTimeMs: pi.ResponseTimeMs,
			ContentLength:  pi.ContentLength,
			Depth:          pi.Depth,
		}
	}

	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)

	if exportDir != "" {
		if err := exportAll(exportDir, reports); err != nil {
			fmt.Fprintf(os.Stderr, "export error: %v\n", err)
		}
	}

	// Save JSON report to ./reports/<host>.json by default (ignored by git)
	if err := saveReportJSON("reports", reports); err != nil {
		fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
	}

	switch output {
	case "json":
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		if err := enc.Encode(reports); err != nil {
			fmt.Fprintf(os.Stderr, "output error: %v\n", err)
		}
	default:
		report.PrintText(os.Stdout, reports)
	}
}
// truncateForTTY shortens s to at most max characters (runes), replacing the
// tail with "…" when something had to be cut. s is returned unchanged when
// max <= 0 or when s already fits. The cut always falls on a rune boundary,
// so multi-byte UTF-8 sequences are never split.
func truncateForTTY(s string, max int) string {
	// Fast path: the byte length bounds the rune count from above, so a
	// string of at most max bytes cannot exceed max runes.
	if max <= 0 || len(s) <= max {
		return s
	}

	// Walk the runes once, remembering the byte offset at which the
	// (max-1)-th rune ends; that is where the ellipsis goes if s is too long.
	runes := 0
	cut := 0
	for i := range s {
		if runes == max-1 {
			cut = i
		}
		runes++
		if runes > max {
			break
		}
	}
	if runes <= max {
		return s
	}
	return s[:cut] + "…"
}
// exportAll writes every export file for r into the directory
// baseDir/<host of r.Target>, creating that directory if needed. The files are
// written in order: pages.csv, links.csv, pages.ndjson, links.ndjson and
// link_statuses.ndjson. It stops at and returns the first error encountered,
// and returns an error when r.Target does not parse or has no host.
func exportAll(baseDir string, r report.Report) error {
	parsed, parseErr := url.Parse(r.Target)
	if parseErr != nil || parsed.Host == "" {
		return fmt.Errorf("invalid target for export: %s", r.Target)
	}

	outDir := filepath.Join(baseDir, parsed.Host)
	if mkErr := os.MkdirAll(outDir, 0o755); mkErr != nil {
		return mkErr
	}

	// Each step is a closure so that the NDJSON records are only built when
	// their step is actually reached.
	steps := []func() error{
		func() error { return exportCSVPages(filepath.Join(outDir, "pages.csv"), r) },
		func() error { return exportCSVLinks(filepath.Join(outDir, "links.csv"), r) },
		func() error { return exportNDJSON(filepath.Join(outDir, "pages.ndjson"), pagesToNDJSON(r)) },
		func() error { return exportNDJSON(filepath.Join(outDir, "links.ndjson"), linksToNDJSON(r)) },
		func() error {
			return exportNDJSON(filepath.Join(outDir, "link_statuses.ndjson"), linkStatusesToNDJSON(r))
		},
	}
	for _, step := range steps {
		if stepErr := step(); stepErr != nil {
			return stepErr
		}
	}
	return nil
}
// exportCSVPages creates (or truncates) the file at path and writes a CSV
// header followed by one row per entry of r.Pages with the columns url, title,
// responseTimeMs, contentLength and depth. Rows follow map iteration order,
// which is not deterministic. It returns the first error from creating,
// writing, flushing or closing the file.
func exportCSVPages(path string, r report.Report) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// Report a Close failure unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	w := csv.NewWriter(f)
	if err := w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"}); err != nil {
		return err
	}
	for u, pm := range r.Pages {
		rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
		if err := w.Write(rec); err != nil {
			return err
		}
	}

	// Flush explicitly before asking for the error: a deferred Flush would run
	// only after the return value had been computed, hiding flush failures.
	w.Flush()
	return w.Error()
}
// exportCSVLinks creates (or truncates) the file at path and writes a CSV
// header followed by one sourceUrl,targetUrl row for every outlink recorded in
// r.PageOutlinks. Sources follow map iteration order, which is not
// deterministic. It returns the first error from creating, writing, flushing
// or closing the file.
func exportCSVLinks(path string, r report.Report) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// Report a Close failure unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	w := csv.NewWriter(f)
	if err := w.Write([]string{"sourceUrl", "targetUrl"}); err != nil {
		return err
	}
	for src, lst := range r.PageOutlinks {
		for _, dst := range lst {
			if err := w.Write([]string{src, dst}); err != nil {
				return err
			}
		}
	}

	// Flush explicitly before asking for the error: a deferred Flush would run
	// only after the return value had been computed, hiding flush failures.
	w.Flush()
	return w.Error()
}
// ndjsonItem is a single record of an NDJSON export; any JSON-encodable value
// is accepted.
type ndjsonItem interface{}

// exportNDJSON creates (or truncates) the file at path and writes each item as
// one JSON value per line, in slice order. An empty or nil items slice yields
// an empty file. It returns the first error from creating, encoding into or
// closing the file.
func exportNDJSON(path string, items []ndjsonItem) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// Report a Close failure unless an earlier error is already being
	// returned; write errors on a file can surface only at Close.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	enc := json.NewEncoder(f)
	for _, it := range items {
		if err := enc.Encode(it); err != nil {
			return err
		}
	}
	return nil
}
// pagesToNDJSON converts every entry of r.Pages into an NDJSON record of type
// "page" carrying the page URL together with its title, response time,
// content length and depth. Records follow map iteration order.
func pagesToNDJSON(r report.Report) []ndjsonItem {
	items := make([]ndjsonItem, 0, len(r.Pages))
	for pageURL, info := range r.Pages {
		record := make(map[string]any, 6)
		record["type"] = "page"
		record["url"] = pageURL
		record["title"] = info.Title
		record["responseTimeMs"] = info.ResponseTimeMs
		record["contentLength"] = info.ContentLength
		record["depth"] = info.Depth
		items = append(items, record)
	}
	return items
}
// linksToNDJSON flattens r.PageOutlinks into one NDJSON record of type "link"
// per (source, destination) pair. The result is nil when there are no
// outlinks at all.
func linksToNDJSON(r report.Report) []ndjsonItem {
	var items []ndjsonItem
	for source, targets := range r.PageOutlinks {
		for _, dest := range targets {
			record := map[string]any{"type": "link"}
			record["src"] = source
			record["dest"] = dest
			items = append(items, record)
		}
	}
	return items
}
// linkStatusesToNDJSON converts every entry of r.LinkStatuses into an NDJSON
// record of type "link_status" holding the checked URL, its status code, the
// OK flag and the recorded error.
func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
	items := make([]ndjsonItem, 0, len(r.LinkStatuses))
	for _, status := range r.LinkStatuses {
		record := make(map[string]any, 5)
		record["type"] = "link_status"
		record["url"] = status.URL
		record["statusCode"] = status.StatusCode
		record["ok"] = status.OK
		record["error"] = status.Err
		items = append(items, record)
	}
	return items
}
// saveReportJSON writes r as indented JSON to baseDir/<host of r.Target>.json,
// creating baseDir if needed and truncating any existing file. It returns an
// error when r.Target does not parse or has no host, or when creating,
// encoding into or closing the file fails.
func saveReportJSON(baseDir string, r report.Report) (err error) {
	u, err := url.Parse(r.Target)
	if err != nil || u.Host == "" {
		return fmt.Errorf("invalid target for save: %s", r.Target)
	}
	if err := os.MkdirAll(baseDir, 0o755); err != nil {
		return err
	}

	path := filepath.Join(baseDir, u.Host+".json")
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	// Report a Close failure unless an earlier error is already being
	// returned; write errors on a file can surface only at Close.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	return enc.Encode(r)
}