feat: initial import with progress UI; docs: REPORT_SCHEMA, TODO; v0.0.1

colin 2025-08-31 09:19:34 -04:00
commit e7b4d33971
16 changed files with 8783 additions and 0 deletions

45
TODO.md Normal file

@ -0,0 +1,45 @@
## Roadmap (post v0.0.1)
Prioritized from easiest/low-risk to more involved work. Check off as we ship.
### Quick wins (target v0.0.2)
- [ ] Add crawl metadata (startedAt, finishedAt, durationMs)
- [ ] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
- [ ] Status histogram (2xx/3xx/4xx/5xx totals) in summary (see the sketch after this list)
- [ ] Normalize and dedupe trailing `/.` URL variants in output
- [ ] Add compact `reportSummary` text block to JSON
- [ ] Top external domains with counts
- [ ] Broken links sample (first N) + per-domain broken counts
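
A rough sketch of how the status histogram could be derived from the `linkStatuses` data already in the report. The bucket names and the helper's home in `internal/report` are assumptions, not shipped code:

```go
// Possible shape for the status-histogram quick win; purely illustrative.
package report

import "urlcrawler/internal/linkcheck"

// StatusHistogram buckets link-check results by status class; status 0
// (no response received) is counted under "error".
func StatusHistogram(statuses []linkcheck.LinkStatus) map[string]int {
	hist := make(map[string]int)
	for _, s := range statuses {
		switch {
		case s.StatusCode >= 200 && s.StatusCode < 300:
			hist["2xx"]++
		case s.StatusCode >= 300 && s.StatusCode < 400:
			hist["3xx"]++
		case s.StatusCode >= 400 && s.StatusCode < 500:
			hist["4xx"]++
		case s.StatusCode >= 500:
			hist["5xx"]++
		default:
			hist["error"]++
		}
	}
	return hist
}
```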
### Moderate scope
- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
- [ ] Per-page response time (responseTimeMs) and content length
- [ ] Basic page metadata: `<title>`, canonical (if present)
- [ ] Depth distribution (count of pages by depth)
- [ ] Duplicate title/canonical detection (lists of URLs)
### Content/asset analysis
- [ ] Extract assets (images/css/js) per page with status/type/size
- [ ] Mixed-content detection (http assets on https pages)
- [ ] Image accessibility metric (alt present ratio)
### Security and quality signals
- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
- [ ] Insecure forms (http action on https page)
- [ ] Large pages and slow pages (p95 thresholds) summary
### Link behavior and graph
- [ ] Redirect map (from → to, hops; count summary)
- [ ] Indegree/outdegree stats; small graph summary
### Outputs and UX
- [ ] CSV exports: pages.csv, links.csv, assets.csv
- [ ] NDJSON export option for streaming pipelines
- [ ] Optional: include file/line anchors in JSON for large outputs
### Notes
- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.

5
go.mod Normal file

@ -0,0 +1,5 @@
module urlcrawler
go 1.22
require golang.org/x/net v0.29.0

2
go.sum Normal file

@ -0,0 +1,2 @@
golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=

145
internal/crawler/crawler.go Normal file

@ -0,0 +1,145 @@
package crawler
import (
"context"
"net/http"
"strings"
"sync"
"urlcrawler/internal/htmlx"
"urlcrawler/internal/urlutil"
)
type task struct {
url string
depth int
}
// Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
// visitedCallback is invoked when a page is dequeued and about to be fetched; errorCallback is invoked when that fetch fails.
// visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
// errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
visited := make(map[string]struct{})
errs := make(map[string]error)
outlinks := make(map[string]map[string]struct{})
var mu sync.Mutex
origin := urlutil.Origin(startURL)
tasks := make(chan task, concurrency*2)
wgWorkers := sync.WaitGroup{}
wgTasks := sync.WaitGroup{}
enqueue := func(t task) {
wgTasks.Add(1)
tasks <- t
}
worker := func() {
defer wgWorkers.Done()
for tk := range tasks {
if ctx.Err() != nil {
wgTasks.Done()
return
}
mu.Lock()
if _, seen := visited[tk.url]; seen {
mu.Unlock()
wgTasks.Done()
continue
}
visited[tk.url] = struct{}{}
mu.Unlock()
if visitedCallback != nil {
visitedCallback(tk.url, tk.depth, len(tasks))
}
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
mu.Lock()
errs[tk.url] = err
mu.Unlock()
if errorCallback != nil {
errorCallback(tk.url, err, len(tasks))
}
wgTasks.Done()
continue
}
func() {
defer resp.Body.Close()
ct := resp.Header.Get("Content-Type")
if resp.StatusCode != http.StatusOK || !strings.HasPrefix(ct, "text/html") {
return
}
hrefs := htmlx.ExtractAnchors(resp.Body)
var toEnqueue []string
for _, href := range hrefs {
abs, ok := urlutil.Normalize(tk.url, href)
if !ok {
continue
}
mu.Lock()
m, ok2 := outlinks[tk.url]
if !ok2 {
m = make(map[string]struct{})
outlinks[tk.url] = m
}
m[abs] = struct{}{}
mu.Unlock()
if tk.depth < maxDepth {
if !sameHostOnly || urlutil.SameHost(origin, abs) {
toEnqueue = append(toEnqueue, abs)
}
}
}
for _, u := range toEnqueue {
enqueue(task{url: u, depth: tk.depth + 1})
}
}()
wgTasks.Done()
}
}
for i := 0; i < concurrency; i++ {
wgWorkers.Add(1)
go worker()
}
// Close the tasks channel when all enqueued tasks are processed.
go func() {
wgTasks.Wait()
close(tasks)
}()
enqueue(task{url: startURL, depth: 0})
wgWorkers.Wait()
return visited, errs, outlinks
}
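
A minimal usage sketch for `Crawl`, assuming a throwaway `httptest` server; the server, flag values, and user agent here are illustrative, not part of this commit:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"

	"urlcrawler/internal/crawler"
)

func main() {
	// Tiny site: every page links to /about and is served as text/html.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		fmt.Fprint(w, `<a href="/about">about</a>`)
	}))
	defer srv.Close()

	client := &http.Client{Timeout: 5 * time.Second}
	visited, errs, outlinks := crawler.Crawl(
		context.Background(), srv.URL,
		1,    // maxDepth
		2,    // concurrency
		true, // sameHostOnly
		client, "urlcrawler/1.0",
		nil, nil, // no progress callbacks
	)
	fmt.Println(len(visited), len(errs), len(outlinks)) // 2 0 2
}
```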

38
internal/htmlx/htmlx.go Normal file

@ -0,0 +1,38 @@
package htmlx
import (
"io"
"strings"
"golang.org/x/net/html"
)
// ExtractAnchors returns all hrefs from <a> tags.
func ExtractAnchors(r io.Reader) []string {
tokens := html.NewTokenizer(r)
var hrefs []string
for {
t := tokens.Next()
switch t {
case html.StartTagToken, html.SelfClosingTagToken:
tn, hasAttr := tokens.TagName()
if string(tn) != "a" || !hasAttr {
continue
}
for {
key, val, more := tokens.TagAttr()
if string(key) == "href" {
v := strings.TrimSpace(string(val))
if v != "" {
hrefs = append(hrefs, v)
}
}
if !more {
break
}
}
case html.ErrorToken:
return hrefs
}
}
}
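
For illustration, feeding `ExtractAnchors` a small snippet via `strings.NewReader` (the HTML is made up):

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx"
)

func main() {
	snippet := `<p><a href="/a">A</a> <a href=" /b ">B</a> <a>no href</a></p>`
	fmt.Println(htmlx.ExtractAnchors(strings.NewReader(snippet)))
	// [/a /b]  (hrefs are trimmed; empty or missing hrefs are skipped)
}
```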

84
internal/linkcheck/linkcheck.go Normal file

@ -0,0 +1,84 @@
package linkcheck
import (
"context"
"net/http"
"sync"
)
type LinkStatus struct {
URL string `json:"url"`
StatusCode int `json:"statusCode"`
OK bool `json:"ok"`
Err string `json:"error,omitempty"`
}
type Results struct {
Statuses []LinkStatus `json:"statuses"`
}
func Check(ctx context.Context, urls map[string]struct{}, concurrency int, client *http.Client, userAgent string, showProgress bool, progressCallback func(bool)) Results {
var mu sync.Mutex
var statuses []LinkStatus
type job struct{ u string }
jobs := make(chan job, concurrency*2)
wg := sync.WaitGroup{}
worker := func() {
defer wg.Done()
for j := range jobs {
status, err := headOrGet(ctx, client, userAgent, j.u)
ls := LinkStatus{URL: j.u, StatusCode: status}
if err != nil {
ls.Err = err.Error()
}
ls.OK = err == nil && status < 400 && status >= 200
mu.Lock()
statuses = append(statuses, ls)
mu.Unlock()
if progressCallback != nil {
progressCallback(ls.OK)
}
}
}
for i := 0; i < concurrency; i++ {
wg.Add(1)
go worker()
}
for u := range urls {
jobs <- job{u: u}
}
close(jobs)
wg.Wait()
return Results{Statuses: statuses}
}
func headOrGet(ctx context.Context, client *http.Client, userAgent string, u string) (int, error) {
req, _ := http.NewRequestWithContext(ctx, http.MethodHead, u, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err == nil {
defer resp.Body.Close()
if resp.StatusCode >= 200 && resp.StatusCode < 400 {
return resp.StatusCode, nil
}
// Some servers don't implement HEAD properly; fall back to GET only on 405 or 5xx. Other statuses below 500 are returned as-is.
if resp.StatusCode != http.StatusMethodNotAllowed && resp.StatusCode < 500 {
return resp.StatusCode, nil
}
}
// Fallback GET
req2, _ := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
req2.Header.Set("User-Agent", userAgent)
resp2, err2 := client.Do(req2)
if err2 != nil {
return 0, err2
}
defer resp2.Body.Close()
return resp2.StatusCode, nil
}
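
A small usage sketch for `Check` with a hand-built URL set; the URLs and settings are placeholders:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/linkcheck"
)

func main() {
	urls := map[string]struct{}{
		"https://example.com/":        {},
		"https://example.com/missing": {},
	}
	client := &http.Client{Timeout: 10 * time.Second}
	// showProgress=false, no progress callback.
	res := linkcheck.Check(context.Background(), urls, 4, client, "urlcrawler/1.0", false, nil)
	for _, s := range res.Statuses {
		fmt.Println(s.StatusCode, s.OK, s.URL, s.Err)
	}
}
```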

96
internal/report/report.go Normal file

@ -0,0 +1,96 @@
package report
import (
"fmt"
"io"
"sort"
"urlcrawler/internal/linkcheck"
)
type Report struct {
Target string `json:"target"`
CrawledURLs []string `json:"crawledUrls"`
SitemapURLs []string `json:"sitemapUrls,omitempty"`
CrawlErrors map[string]string `json:"crawlErrors,omitempty"`
LinkStatuses []linkcheck.LinkStatus `json:"linkStatuses"`
PageOutlinks map[string][]string `json:"pageOutlinks"`
LinkSources map[string][]string `json:"linkSources"`
MissingInSitemap []string `json:"missingInSitemap,omitempty"`
InSitemapNotCrawled []string `json:"inSitemapNotCrawled,omitempty"`
}
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}) Report {
crawledList := keys(crawled)
sitemapList := keys(sitemap)
crawlErrMap := make(map[string]string, len(crawlErrs))
for k, v := range crawlErrs {
crawlErrMap[k] = v.Error()
}
missing := difference(crawled, sitemap)
missingList := keys(missing)
inSmNotCrawled := difference(sitemap, crawled)
inSmNotCrawledList := keys(inSmNotCrawled)
pageOut := make(map[string][]string, len(outlinks))
linkSrc := make(map[string][]string)
for page, set := range outlinks {
lst := keys(set)
pageOut[page] = lst
for _, u := range lst {
linkSrc[u] = append(linkSrc[u], page)
}
}
return Report{
Target: target,
CrawledURLs: crawledList,
SitemapURLs: sitemapList,
CrawlErrors: crawlErrMap,
LinkStatuses: check.Statuses,
PageOutlinks: pageOut,
LinkSources: linkSrc,
MissingInSitemap: missingList,
InSitemapNotCrawled: inSmNotCrawledList,
}
}
func PrintText(w io.Writer, r Report) {
fmt.Fprintf(w, "Target: %s\n\n", r.Target)
fmt.Fprintf(w, "Crawled URLs: %d\n", len(r.CrawledURLs))
fmt.Fprintf(w, "Sitemap URLs: %d\n", len(r.SitemapURLs))
fmt.Fprintf(w, "Links checked: %d\n", len(r.LinkStatuses))
fmt.Fprintf(w, "Missing in sitemap: %d\n", len(r.MissingInSitemap))
fmt.Fprintf(w, "In sitemap not crawled: %d\n\n", len(r.InSitemapNotCrawled))
// Keep text output concise; details available in JSON
}
func keys[T comparable](m map[T]struct{}) []T {
res := make([]T, 0, len(m))
for k := range m {
res = append(res, k)
}
sort.Slice(res, func(i, j int) bool { return asString(res[i]) < asString(res[j]) })
return res
}
func asString[T any](v T) string {
switch x := any(v).(type) {
case string:
return x
default:
return fmt.Sprintf("%v", v)
}
}
func difference(a, b map[string]struct{}) map[string]struct{} {
res := make(map[string]struct{})
for k := range a {
if _, ok := b[k]; !ok {
res[k] = struct{}{}
}
}
return res
}
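
A sketch of how `Build` is fed and what the derived sets contain, using made-up values; it mirrors the flow in `main.go`:

```go
package main

import (
	"encoding/json"
	"os"

	"urlcrawler/internal/linkcheck"
	"urlcrawler/internal/report"
)

func main() {
	crawled := map[string]struct{}{"https://example.com/": {}, "https://example.com/x": {}}
	sm := map[string]struct{}{"https://example.com/": {}, "https://example.com/y": {}}
	outlinks := map[string]map[string]struct{}{
		"https://example.com/": {"https://example.com/x": {}},
	}
	check := linkcheck.Results{Statuses: []linkcheck.LinkStatus{
		{URL: "https://example.com/x", StatusCode: 200, OK: true},
	}}

	r := report.Build("https://example.com/", crawled, sm, nil, check, outlinks)
	// missingInSitemap:    ["https://example.com/x"]  (crawled minus sitemap)
	// inSitemapNotCrawled: ["https://example.com/y"]  (sitemap minus crawled)

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	_ = enc.Encode(r)
}
```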

110
internal/sitemap/sitemap.go Normal file

@ -0,0 +1,110 @@
package sitemap
import (
"compress/gzip"
"context"
"encoding/xml"
"errors"
"io"
"net/http"
"strings"
"urlcrawler/internal/urlutil"
)
var ErrNotFound = errors.New("sitemap not found")
// FetchAll attempts to fetch /sitemap.xml and /sitemap_index.xml, parse URLs, and follow indexes.
func FetchAll(ctx context.Context, target string, client *http.Client, userAgent string) (map[string]struct{}, error) {
origin := urlutil.Origin(target)
candidates := []string{origin + "/sitemap.xml", origin + "/sitemap_index.xml"}
found := make(map[string]struct{})
var haveURLs bool
for _, u := range candidates {
urls, err := fetchOne(ctx, u, client, userAgent)
if err == nil && len(urls) > 0 {
haveURLs = true
for v := range urls {
found[v] = struct{}{}
}
}
}
if !haveURLs {
return found, ErrNotFound
}
return found, nil
}
func fetchOne(ctx context.Context, u string, client *http.Client, userAgent string) (map[string]struct{}, error) {
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, errors.New(resp.Status)
}
var r io.Reader = resp.Body
if strings.HasSuffix(strings.ToLower(u), ".gz") || strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "gzip") {
gz, err := gzip.NewReader(resp.Body)
if err == nil {
r = gz
defer gz.Close()
}
}
data, err := io.ReadAll(r)
if err != nil {
return nil, err
}
// Parse unconditionally: sitemaps are often served with generic or wrong
// Content-Type headers, so the body itself decides whether it is usable.
return parseSitemapXML(ctx, client, userAgent, data)
}
func parseSitemapXML(ctx context.Context, client *http.Client, userAgent string, data []byte) (map[string]struct{}, error) {
type urlEntry struct {
Loc string `xml:"loc"`
}
type urlSet struct {
URLs []urlEntry `xml:"url"`
}
type indexEntry struct {
Loc string `xml:"loc"`
}
type siteIndex struct {
Sitemaps []indexEntry `xml:"sitemap"`
}
found := make(map[string]struct{})
// First try urlset
var us urlSet
if err := xml.Unmarshal(data, &us); err == nil && len(us.URLs) > 0 {
for _, e := range us.URLs {
loc := strings.TrimSpace(e.Loc)
if loc != "" {
found[loc] = struct{}{}
}
}
return found, nil
}
// Then try index
var si siteIndex
if err := xml.Unmarshal(data, &si); err == nil && len(si.Sitemaps) > 0 {
for _, e := range si.Sitemaps {
loc := strings.TrimSpace(e.Loc)
if loc == "" {
continue
}
child, err := fetchOne(ctx, loc, client, userAgent)
if err == nil {
for v := range child {
found[v] = struct{}{}
}
}
}
return found, nil
}
return found, errors.New("unrecognized sitemap XML")
}
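
A sketch exercising `FetchAll` end to end against a throwaway `httptest` server that serves a minimal sitemap index plus one child sitemap; the server and URLs are illustrative:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"

	"urlcrawler/internal/sitemap"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/sitemap.xml", func(w http.ResponseWriter, r *http.Request) {
		// Index pointing at a child sitemap on the same host.
		fmt.Fprintf(w, `<sitemapindex><sitemap><loc>http://%s/sitemap-pages.xml</loc></sitemap></sitemapindex>`, r.Host)
	})
	mux.HandleFunc("/sitemap-pages.xml", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<urlset><url><loc>https://example.com/</loc></url></urlset>`)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()

	client := &http.Client{Timeout: 5 * time.Second}
	urls, err := sitemap.FetchAll(context.Background(), srv.URL, client, "urlcrawler/1.0")
	fmt.Println(urls, err) // map[https://example.com/:{}] <nil>
}
```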

62
internal/urlutil/urlutil.go Normal file

@ -0,0 +1,62 @@
package urlutil
import (
"net/url"
"path"
"strings"
)
// Normalize resolves href against base, strips fragments, and cleans path.
func Normalize(baseURL string, href string) (string, bool) {
if href == "" {
return "", false
}
if strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") {
return "", false
}
b, err := url.Parse(baseURL)
if err != nil {
return "", false
}
u, err := url.Parse(href)
if err != nil {
return "", false
}
// Resolve relative links.
u = b.ResolveReference(u)
// Only http/https
if u.Scheme != "http" && u.Scheme != "https" {
return "", false
}
// Drop fragments
u.Fragment = ""
// Clean path
u.Path = path.Clean(u.Path)
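// Note: path.Clean maps an empty path to ".", so an absolute link with no path
// (e.g. href="https://example.com") serializes with a trailing "/." variant;
// deduping these in output is tracked in TODO.md.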
return u.String(), true
}
// SameHost returns true if url shares the same host (including port) as base.
func SameHost(baseURL string, candidate string) bool {
b, err := url.Parse(baseURL)
if err != nil {
return false
}
u, err := url.Parse(candidate)
if err != nil {
return false
}
return strings.EqualFold(b.Host, u.Host)
}
// Origin returns scheme://host of a URL.
func Origin(raw string) string {
u, err := url.Parse(raw)
if err != nil {
return raw
}
u.Path = ""
u.RawQuery = ""
u.Fragment = ""
return u.Scheme + "://" + u.Host
}
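
Illustrative behavior of the helpers above; inputs are made up and expected results are shown in comments:

```go
package main

import (
	"fmt"

	"urlcrawler/internal/urlutil"
)

func main() {
	abs, ok := urlutil.Normalize("https://example.com/a/b", "../c?x=1#frag")
	fmt.Println(abs, ok) // https://example.com/c?x=1 true

	_, ok = urlutil.Normalize("https://example.com/", "mailto:hi@example.com")
	fmt.Println(ok) // false (non-http(s) schemes are rejected)

	fmt.Println(urlutil.SameHost("https://example.com/x", "https://EXAMPLE.com/y")) // true
	fmt.Println(urlutil.Origin("https://example.com/a/b?q=1"))                      // https://example.com
}
```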

7
internal_pages.txt Normal file

@ -0,0 +1,7 @@
https://www.blackswanstrength.com
https://www.blackswanstrength.com/.
https://www.blackswanstrength.com/booking
https://www.blackswanstrength.com/client-testimonials
https://www.blackswanstrength.com/home
https://www.blackswanstrength.com/like-a-dog-chasing-prs
https://www.blackswanstrength.com/online-coaching

8
internal_urls_all.txt Normal file

@ -0,0 +1,8 @@
https://www.blackswanstrength.com
https://www.blackswanstrength.com/
https://www.blackswanstrength.com/.
https://www.blackswanstrength.com/booking
https://www.blackswanstrength.com/client-testimonials
https://www.blackswanstrength.com/home
https://www.blackswanstrength.com/like-a-dog-chasing-prs
https://www.blackswanstrength.com/online-coaching

162
main.go Normal file

@ -0,0 +1,162 @@
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"os"
"strings"
"sync/atomic"
"time"
"urlcrawler/internal/crawler"
"urlcrawler/internal/linkcheck"
"urlcrawler/internal/report"
"urlcrawler/internal/sitemap"
)
func main() {
var target string
var concurrency int
var timeout time.Duration
var maxDepth int
var userAgent string
var sameHostOnly bool
var output string
var quiet bool
flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
flag.DurationVar(&timeout, "timeout", 10*time.Second, "HTTP timeout per request")
flag.IntVar(&maxDepth, "max-depth", 2, "Maximum crawl depth (0=crawl only the start page)")
flag.StringVar(&userAgent, "user-agent", "urlcrawler/1.0", "User-Agent header value")
flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
flag.StringVar(&output, "output", "text", "Output format: text|json")
flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
flag.Parse()
if strings.TrimSpace(target) == "" {
fmt.Fprintln(os.Stderr, "-target is required")
flag.Usage()
os.Exit(2)
}
client := &http.Client{Timeout: timeout}
ctx := context.Background()
fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)
// Setup progress counters
var urlsVisited, urlsErrored atomic.Int64
var currentURL atomic.Value // string
var pendingTasks atomic.Int64
// Start progress reporter if not in quiet mode
ctxWithCancel, cancel := context.WithCancel(ctx)
defer cancel()
if !quiet {
go func() {
ticker := time.NewTicker(500 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-ticker.C:
cu, _ := currentURL.Load().(string)
fmt.Fprintf(os.Stderr, "\rURLs visited: %d | Errors: %d | Pending: %d | Current: %s",
urlsVisited.Load(), urlsErrored.Load(), pendingTasks.Load(), truncateForTTY(cu, 90))
case <-ctxWithCancel.Done():
return
}
}
}()
}
// Progress callback functions
visitedCallback := func(u string, depth int, pending int) {
urlsVisited.Add(1)
pendingTasks.Store(int64(pending))
currentURL.Store(u)
}
errorCallback := func(u string, err error, pending int) {
urlsErrored.Add(1)
pendingTasks.Store(int64(pending))
currentURL.Store(u)
}
visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
// Clear progress line before moving to next phase
if !quiet {
fmt.Fprintf(os.Stderr, "\rCrawl complete! URLs visited: %d | Errors: %d\n",
urlsVisited.Load(), urlsErrored.Load())
}
fmt.Fprintf(os.Stderr, "Fetching sitemap...\n")
smURLs, err := sitemap.FetchAll(ctx, target, client, userAgent)
if err != nil && !errors.Is(err, sitemap.ErrNotFound) {
fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
}
// Build set of all unique links discovered across pages for status checks
allLinks := make(map[string]struct{})
for _, m := range outlinks {
for u := range m {
allLinks[u] = struct{}{}
}
}
// Also include the visited pages themselves
for u := range visited {
allLinks[u] = struct{}{}
}
fmt.Fprintf(os.Stderr, "Checking %d links...\n", len(allLinks))
// Reset counters for link checking
urlsVisited.Store(0)
urlsErrored.Store(0)
// Progress callback functions for link checking
linkCheckCallback := func(ok bool) {
if ok {
urlsVisited.Add(1)
} else {
urlsErrored.Add(1)
}
}
checkResults := linkcheck.Check(ctx, allLinks, concurrency, client, userAgent, !quiet, linkCheckCallback)
// Clear progress line before finishing
if !quiet {
fmt.Fprintf(os.Stderr, "\rLink checking complete! OK: %d | Errors: %d\n",
urlsVisited.Load(), urlsErrored.Load())
}
fmt.Fprintf(os.Stderr, "Building report...\n")
reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks)
switch output {
case "json":
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
_ = enc.Encode(reports)
default:
report.PrintText(os.Stdout, reports)
}
}
// truncateForTTY truncates s to at most max runes, replacing the tail with … if needed.
func truncateForTTY(s string, max int) string {
r := []rune(s)
if max <= 0 || len(r) <= max {
return s
}
if max <= 1 {
return "…"
}
return string(r[:max-1]) + "…"
}

7670
report.json Normal file

File diff suppressed because it is too large

59
reports/REPORT_SCHEMA.md Normal file

@ -0,0 +1,59 @@
## URLCrawler Report JSON Schema
This document describes the structure of the JSON reports produced by `urlcrawler` when run with `-output json`.
### Top-level object
```json
{
"target": "https://example.com",
"crawledUrls": ["https://example.com", "https://example.com/about"],
"sitemapUrls": ["https://example.com", "https://example.com/about"],
"crawlErrors": {"https://bad.example": "error string"},
"linkStatuses": [
{"url": "https://example.com", "statusCode": 200, "ok": true},
{"url": "https://other.example/broken", "statusCode": 404, "ok": false, "error": "..."}
],
"pageOutlinks": {
"https://example.com": ["https://example.com/about", "https://other.example/"]
},
"linkSources": {
"https://example.com/about": ["https://example.com"]
},
"missingInSitemap": ["https://example.com/page-not-in-sitemap"],
"inSitemapNotCrawled": ["https://example.com/deferred"]
}
```
### Fields
- **target** (string): The start URL used for the crawl, as provided via `-target`.
- **crawledUrls** (string[]): Unique URLs that were visited during crawling. Sorted for stability.
- **sitemapUrls** (string[]; optional): All URLs discovered via `sitemap.xml` (and nested sitemaps). Omitted when no sitemap is found.
- **crawlErrors** (object map<string,string>; optional): Maps URL → error message for requests that failed (e.g., network/TLS/timeouts). Only set when errors occurred.
- **linkStatuses** (LinkStatus[]): Result of HTTP status checks for all unique links discovered (including the pages themselves).
- **url** (string): The checked URL.
- **statusCode** (number): HTTP status code (0 if request failed before a response was received).
- **ok** (boolean): Convenience flag, true when `200 ≤ statusCode < 400` and no error occurred.
- **error** (string; optional): Error string when a request failed or there was another client error.
- **pageOutlinks** (object map<string,string[]>): For each crawled page URL, the list of normalized outgoing links (internal and external).
- **linkSources** (object map<string,string[]>): Inverse index: for each discovered link URL, the list of page URLs where it appeared.
- **missingInSitemap** (string[]; optional): URLs that were crawled but not present in the sitemap.
- **inSitemapNotCrawled** (string[]; optional): URLs present in the sitemap that were not crawled (e.g., due to depth limits or off-host rules).
### Notes
- URLs are normalized and deduplicated during crawl.
- Content-type filtering: only `text/html` pages are parsed for outlinks.
- Sitemap fetching is best-effort; absence is not treated as an error.
- The JSON lists are sorted to produce stable outputs across runs.
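
As a sketch, here is a small consumer of this report format that lists non-OK links and the pages that reference them. The struct mirrors only the fields it needs and assumes the default `report.json` file name:

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type linkStatus struct {
	URL        string `json:"url"`
	StatusCode int    `json:"statusCode"`
	OK         bool   `json:"ok"`
	Err        string `json:"error,omitempty"`
}

type reportDoc struct {
	LinkStatuses []linkStatus        `json:"linkStatuses"`
	LinkSources  map[string][]string `json:"linkSources"`
}

func main() {
	data, err := os.ReadFile("report.json")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	var r reportDoc
	if err := json.Unmarshal(data, &r); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, ls := range r.LinkStatuses {
		if !ls.OK {
			fmt.Printf("%d %s (linked from %v)\n", ls.StatusCode, ls.URL, r.LinkSources[ls.URL])
		}
	}
}
```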


@ -0,0 +1,290 @@
{
"target": "https://titan-training.ca",
"crawledUrls": [
"https://titan-training.ca",
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca"
],
"sitemapUrls": [
"https://titan-training.ca/home",
"https://titan-training.ca/test_path?item=123"
],
"linkStatuses": [
{
"url": "https://titan-training.ca/products-list",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.cloudflare.com/5xx-error-landing",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.facebook.com/titantrainingkw",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public",
"statusCode": 200,
"ok": true
},
{
"url": "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
"statusCode": 200,
"ok": true
},
{
"url": "https://developers.cloudflare.com/fundamentals/setup/account/create-account",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.cloudflare.com/sign-up?utm_source=email_protection",
"statusCode": 403,
"ok": false
},
{
"url": "https://titan-training.ca/cdn-cgi/l/email-protection",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/titan-training.ca",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/.",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"statusCode": 404,
"ok": false
},
{
"url": "https://www.instagram.com/titan__training",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/titan-training.ca",
"statusCode": 200,
"ok": true
}
],
"pageOutlinks": {
"https://titan-training.ca": [
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/.": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/cdn-cgi/l/email-protection": [
"https://developers.cloudflare.com/fundamentals/setup/account/create-account",
"https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
"https://www.cloudflare.com/5xx-error-landing",
"https://www.cloudflare.com/sign-up?utm_source=email_protection"
],
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/titan-training.ca",
"https://titan-training.ca/products-list",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/product-details/product/titan-training.ca",
"https://titan-training.ca/products-list",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/products-list": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/titan-training.ca": [
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
]
},
"linkSources": {
"https://developers.cloudflare.com/fundamentals/setup/account/create-account": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://titan-training.ca/": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/.": [
"https://titan-training.ca"
],
"https://titan-training.ca/cdn-cgi/l/email-protection": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/titan-training.ca": [
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275"
],
"https://titan-training.ca/products-list": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/titan-training.ca": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.cloudflare.com/5xx-error-landing": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://www.cloudflare.com/sign-up?utm_source=email_protection": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://www.facebook.com/titantrainingkw": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca"
],
"https://www.instagram.com/titan__training": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
]
},
"missingInSitemap": [
"https://titan-training.ca",
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca"
],
"inSitemapNotCrawled": [
"https://titan-training.ca/home",
"https://titan-training.ca/test_path?item=123"
]
}

BIN
urlcrawler Executable file

Binary file not shown.