Compare commits


No commits in common. "main" and "v0.0.1" have entirely different histories.
main ... v0.0.1

13 changed files with 396 additions and 466 deletions

.gitignore

@@ -1,11 +0,0 @@
# Binaries (root)
urlcrawler
# Local reports and exports (ignore by default)
reports/
exports/
# Misc
.DS_Store
*.log

TODO.md

@@ -1,28 +1,45 @@
## Roadmap (post v0.0.2)
## Roadmap (post v0.0.1)
Prioritized from easiest/low-risk to more involved work. Check off as we ship.
### Shipped in v0.0.2
- [x] Add crawl metadata (startedAt, finishedAt, durationMs)
- [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
- [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
- [x] Normalize and dedupe trailing `/.` URL variants in output
- [x] Add compact `reportSummary` text block to JSON
- [x] Top external domains with counts
- [x] Broken links sample (first N) + per-domain broken counts
- [x] Robots.txt summary (present, fetchedAt)
- [x] Sitemap extras (index → child sitemaps, fetch errors)
- [x] Per-page response time (responseTimeMs) and content length (basic)
- [x] Basic page metadata: `<title>`
- [x] Depth distribution (count of pages by depth)
- [x] Redirect map summary (from → to domain counts)
### Quick wins (target v0.0.2)
- [ ] Add crawl metadata (startedAt, finishedAt, durationMs)
- [ ] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
- [ ] Status histogram (2xx/3xx/4xx/5xx totals) in summary
- [ ] Normalize and dedupe trailing `/.` URL variants in output
- [ ] Add compact `reportSummary` text block to JSON
- [ ] Top external domains with counts
- [ ] Broken links sample (first N) + per-domain broken counts
### Next (target v0.0.3)
- [x] CSV exports: pages.csv, links.csv
- [x] NDJSON export option for streaming pipelines
### Moderate scope
- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
- [ ] Per-page response time (responseTimeMs) and content length
- [ ] Basic page metadata: `<title>`, canonical (if present)
- [ ] Depth distribution (count of pages by depth)
- [ ] Duplicate title/canonical detection (lists of URLs)
### Content/asset analysis
- [ ] Extract assets (images/css/js) per page with status/type/size
- [ ] Mixed-content detection (http assets on https pages)
- [ ] Image accessibility metric (alt present ratio)
### Security and quality signals
- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
- [ ] Insecure forms (http action on https page)
- [ ] Large pages and slow pages (p95 thresholds) summary
### Link behavior and graph
- [ ] Redirect map (from → to, hops; count summary)
- [ ] Indegree/outdegree stats; small graph summary
### Outputs and UX
- [ ] CSV exports: pages.csv, links.csv, assets.csv
- [ ] NDJSON export option for streaming pipelines
- [ ] Optional: include file/line anchors in JSON for large outputs
### Notes
- All report metrics must be gathered by default with zero flags required.
- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.
- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.


@@ -1,24 +0,0 @@
#!/bin/bash
set -euo pipefail
APP=urlcrawler
DIST_DIR=dist
rm -rf "$DIST_DIR"
mkdir -p "$DIST_DIR"
echo "Building $APP for darwin/amd64..."
GOOS=darwin GOARCH=amd64 go build -o "$DIST_DIR/${APP}-darwin-amd64"
echo "Building $APP for darwin/arm64..."
GOOS=darwin GOARCH=arm64 go build -o "$DIST_DIR/${APP}-darwin-arm64"
echo "Building $APP for linux/amd64..."
GOOS=linux GOARCH=amd64 go build -o "$DIST_DIR/${APP}-linux-amd64"
echo "Building $APP for linux/arm64..."
GOOS=linux GOARCH=arm64 go build -o "$DIST_DIR/${APP}-linux-arm64"
echo "Done. Artifacts in $DIST_DIR/"

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -5,7 +5,6 @@ import (
"io"
"net/http"
"sync"
"time"
"urlcrawler/internal/htmlx"
"urlcrawler/internal/urlutil"
@@ -16,22 +15,14 @@ type task struct {
depth int
}
type PageInfo struct {
Title string
ResponseTimeMs int64
ContentLength int
Depth int
}
// Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
// The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
// visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
// errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
visited := make(map[string]struct{})
errs := make(map[string]error)
outlinks := make(map[string]map[string]struct{})
pageInfos := make(map[string]PageInfo)
var mu sync.Mutex
origin := urlutil.Origin(startURL)
@@ -65,14 +56,12 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
visitedCallback(tk.url, tk.depth, len(tasks))
}
start := time.Now()
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
mu.Lock()
errs[tk.url] = err
pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
mu.Unlock()
if errorCallback != nil {
@@ -84,20 +73,10 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
func() {
defer resp.Body.Close()
ct := resp.Header.Get("Content-Type")
// Default meta values
meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
if resp.ContentLength > 0 {
meta.ContentLength = int(resp.ContentLength)
}
if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
mu.Lock()
pageInfos[tk.url] = meta
mu.Unlock()
return
}
body, _ := io.ReadAll(resp.Body)
meta.ContentLength = len(body)
meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
var toEnqueue []string
for _, href := range hrefs {
@@ -123,9 +102,6 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
for _, u := range toEnqueue {
enqueue(task{url: u, depth: tk.depth + 1})
}
mu.Lock()
pageInfos[tk.url] = meta
mu.Unlock()
}()
wgTasks.Done()
}
@@ -145,7 +121,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
enqueue(task{url: startURL, depth: 0})
wgWorkers.Wait()
return visited, errs, outlinks, pageInfos
return visited, errs, outlinks
}
func hasPrefix(s string, prefix string) bool {
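The doc comment above describes the `Crawl` API: it walks pages up to `maxDepth` and reports progress through the two callbacks. Below is a minimal caller sketch against the three-return signature on the v0.0.1 side of this diff; the import path is taken from the diff, while the target URL, user agent, and callback bodies are illustrative assumptions only.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler" // import path assumed from this diff
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// Callbacks receive the URL, its depth (or error), and the pending queue size.
	visitedCb := func(u string, depth, pending int) {
		fmt.Printf("visited %s depth=%d pending=%d\n", u, depth, pending)
	}
	errorCb := func(u string, err error, pending int) {
		fmt.Printf("error %s: %v pending=%d\n", u, err, pending)
	}

	// v0.0.1 signature: visited set, per-URL errors, per-page outgoing links.
	visited, errs, outlinks := crawler.Crawl(
		context.Background(),
		"https://example.com", // illustrative target
		2,                     // maxDepth
		10,                    // concurrency
		true,                  // sameHostOnly
		client,
		"urlcrawler/0.0.1",
		visitedCb,
		errorCb,
	)
	fmt.Printf("visited=%d errors=%d pages=%d\n", len(visited), len(errs), len(outlinks))
}
```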


@@ -36,22 +36,3 @@ func ExtractAnchors(r io.Reader) []string {
}
}
}
// ExtractTitle returns the text content of the first <title> element.
func ExtractTitle(r io.Reader) string {
tokens := html.NewTokenizer(r)
for {
switch tokens.Next() {
case html.StartTagToken:
name, _ := tokens.TagName()
if string(name) == "title" {
if tokens.Next() == html.TextToken {
t := strings.TrimSpace(string(tokens.Text()))
return t
}
}
case html.ErrorToken:
return ""
}
}
}


@@ -3,9 +3,7 @@ package report
import (
"fmt"
"io"
"net/url"
"sort"
"strings"
"urlcrawler/internal/linkcheck"
)
@@ -20,133 +18,31 @@ type Report struct {
LinkSources map[string][]string `json:"linkSources"`
MissingInSitemap []string `json:"missingInSitemap,omitempty"`
InSitemapNotCrawled []string `json:"inSitemapNotCrawled,omitempty"`
Metadata Metadata `json:"metadata"`
Params Params `json:"params"`
Stats Stats `json:"stats"`
ReportSummary string `json:"reportSummary,omitempty"`
TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
Pages map[string]PageMeta `json:"pages"`
DepthDistribution map[int]int `json:"depthDistribution"`
Robots RobotsSummary `json:"robots"`
}
type Metadata struct {
StartedAt string `json:"startedAt"` // RFC3339
FinishedAt string `json:"finishedAt"` // RFC3339
DurationMs int64 `json:"durationMs"`
}
type Params struct {
MaxDepth int `json:"maxDepth"`
Concurrency int `json:"concurrency"`
TimeoutMs int64 `json:"timeoutMs"`
UserAgent string `json:"userAgent"`
SameHostOnly bool `json:"sameHostOnly"`
}
type Stats struct {
OK int `json:"ok"`
Broken int `json:"broken"`
Status2xx int `json:"status2xx"`
Status3xx int `json:"status3xx"`
Status4xx int `json:"status4xx"`
Status5xx int `json:"status5xx"`
StatusOther int `json:"statusOther"`
}
type DomainCount struct {
Domain string `json:"domain"`
Count int `json:"count"`
}
type PageMeta struct {
Title string `json:"title"`
ResponseTimeMs int64 `json:"responseTimeMs"`
ContentLength int `json:"contentLength"`
Depth int `json:"depth"`
}
type RobotsSummary struct {
Present bool `json:"present"`
FetchedAt string `json:"fetchedAt,omitempty"`
}
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
crawledList := sanitizeURLs(keys(crawled))
sitemapList := sanitizeURLs(keys(sitemap))
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}) Report {
crawledList := keys(crawled)
sitemapList := keys(sitemap)
crawlErrMap := make(map[string]string, len(crawlErrs))
for k, v := range crawlErrs {
crawlErrMap[k] = v.Error()
}
missing := difference(crawled, sitemap)
missingList := sanitizeURLs(keys(missing))
missingList := keys(missing)
inSmNotCrawled := difference(sitemap, crawled)
inSmNotCrawledList := sanitizeURLs(keys(inSmNotCrawled))
inSmNotCrawledList := keys(inSmNotCrawled)
pageOut := make(map[string][]string, len(outlinks))
linkSrc := make(map[string][]string)
for page, set := range outlinks {
lst := sanitizeURLs(keys(set))
lst := keys(set)
pageOut[page] = lst
for _, u := range lst {
linkSrc[u] = append(linkSrc[u], page)
}
}
// Compute simple status histogram
var st Stats
for _, ls := range check.Statuses {
if ls.OK {
st.OK++
} else {
st.Broken++
}
switch {
case ls.StatusCode >= 200 && ls.StatusCode < 300:
st.Status2xx++
case ls.StatusCode >= 300 && ls.StatusCode < 400:
st.Status3xx++
case ls.StatusCode >= 400 && ls.StatusCode < 500:
st.Status4xx++
case ls.StatusCode >= 500 && ls.StatusCode < 600:
st.Status5xx++
default:
st.StatusOther++
}
}
// Derived summaries
tHost := hostLower(target)
extCounts := map[string]int{}
brokenByDomain := map[string]int{}
var brokenSample []linkcheck.LinkStatus
for _, ls := range check.Statuses {
h := hostLower(ls.URL)
if h != "" && !strings.EqualFold(h, tHost) {
extCounts[h]++
}
if !ls.OK {
brokenByDomain[h]++
if len(brokenSample) < 10 {
brokenSample = append(brokenSample, ls)
}
}
}
topExternal := mapToSortedSlice(extCounts)
brokenBy := mapToSortedSlice(brokenByDomain)
// Depth distribution
depthDist := make(map[int]int)
for _, pm := range pages {
depthDist[pm.Depth]++
}
summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
return Report{
Target: target,
CrawledURLs: crawledList,
@@ -157,16 +53,6 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
LinkSources: linkSrc,
MissingInSitemap: missingList,
InSitemapNotCrawled: inSmNotCrawledList,
Metadata: meta,
Params: params,
Stats: st,
ReportSummary: summary,
TopExternalDomains: topExternal,
BrokenSample: brokenSample,
BrokenByDomain: brokenBy,
Pages: pages,
DepthDistribution: depthDist,
Robots: robots,
}
}
@@ -208,48 +94,3 @@ func difference(a, b map[string]struct{}) map[string]struct{} {
}
return res
}
// sanitizeURLs normalizes small variants like trailing "/." to "/" for consistency.
func sanitizeURLs(urls []string) []string {
out := make([]string, 0, len(urls))
for _, u := range urls {
out = append(out, sanitizeURL(u))
}
sort.Strings(out)
return out
}
func sanitizeURL(u string) string {
if len(u) >= 2 && u[len(u)-2:] == "/." {
return u[:len(u)-1]
}
return u
}
func hostLower(raw string) string {
u, err := url.Parse(raw)
if err != nil {
return ""
}
return strings.ToLower(u.Host)
}
func mapToSortedSlice(m map[string]int) []DomainCount {
if len(m) == 0 {
return nil
}
out := make([]DomainCount, 0, len(m))
for k, v := range m {
out = append(out, DomainCount{Domain: k, Count: v})
}
sort.Slice(out, func(i, j int) bool {
if out[i].Count == out[j].Count {
return out[i].Domain < out[j].Domain
}
return out[i].Count > out[j].Count
})
if len(out) > 10 {
out = out[:10]
}
return out
}

main.go

@@ -2,15 +2,12 @@ package main
import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync/atomic"
"time"
@@ -19,7 +16,6 @@ import (
"urlcrawler/internal/linkcheck"
"urlcrawler/internal/report"
"urlcrawler/internal/sitemap"
"urlcrawler/internal/urlutil"
)
func main() {
@@ -31,7 +27,6 @@ func main() {
var sameHostOnly bool
var output string
var quiet bool
var exportDir string
flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
@@ -41,7 +36,6 @@ func main() {
flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
flag.StringVar(&output, "output", "text", "Output format: text|json")
flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
flag.Parse()
if strings.TrimSpace(target) == "" {
@@ -53,17 +47,6 @@ func main() {
client := &http.Client{Timeout: timeout}
ctx := context.Background()
// Report metadata
started := time.Now()
meta := report.Metadata{StartedAt: started.UTC().Format(time.RFC3339)}
params := report.Params{
MaxDepth: maxDepth,
Concurrency: concurrency,
TimeoutMs: timeout.Milliseconds(),
UserAgent: userAgent,
SameHostOnly: sameHostOnly,
}
fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)
// Setup progress counters
@@ -105,7 +88,7 @@ func main() {
currentURL.Store(u)
}
visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
// Clear progress line before moving to next phase
if !quiet {
@@ -119,22 +102,6 @@ func main() {
fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
}
// Robots.txt summary (simple)
robots := report.RobotsSummary{}
robotsURL := urlutil.Origin(target) + "/robots.txt"
{
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err == nil {
defer resp.Body.Close()
if resp.StatusCode == http.StatusOK {
robots.Present = true
robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
}
}
}
// Build set of all unique links discovered across pages for status checks
allLinks := make(map[string]struct{})
for _, m := range outlinks {
@@ -170,35 +137,8 @@ func main() {
urlsVisited.Load(), urlsErrored.Load())
}
finished := time.Now()
meta.FinishedAt = finished.UTC().Format(time.RFC3339)
meta.DurationMs = finished.Sub(started).Milliseconds()
fmt.Fprintf(os.Stderr, "Building report...\n")
// Convert pageInfo to report.PageMeta
pages := make(map[string]report.PageMeta, len(pageInfo))
for u, pi := range pageInfo {
pages[u] = report.PageMeta{
Title: pi.Title,
ResponseTimeMs: pi.ResponseTimeMs,
ContentLength: pi.ContentLength,
Depth: pi.Depth,
}
}
reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
if exportDir != "" {
if err := exportAll(exportDir, reports); err != nil {
fmt.Fprintf(os.Stderr, "export error: %v\n", err)
}
}
// Save JSON report to ./reports/<host>.json by default (ignored by git)
if err := saveReportJSON("reports", reports); err != nil {
fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
}
reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks)
switch output {
case "json":
@@ -220,142 +160,3 @@ func truncateForTTY(s string, max int) string {
}
return s[:max-1] + "…"
}
func exportAll(baseDir string, r report.Report) error {
u, err := url.Parse(r.Target)
if err != nil || u.Host == "" {
return fmt.Errorf("invalid target for export: %s", r.Target)
}
dir := filepath.Join(baseDir, u.Host)
if err := os.MkdirAll(dir, 0o755); err != nil {
return err
}
if err := exportCSVPages(filepath.Join(dir, "pages.csv"), r); err != nil {
return err
}
if err := exportCSVLinks(filepath.Join(dir, "links.csv"), r); err != nil {
return err
}
if err := exportNDJSON(filepath.Join(dir, "pages.ndjson"), pagesToNDJSON(r)); err != nil {
return err
}
if err := exportNDJSON(filepath.Join(dir, "links.ndjson"), linksToNDJSON(r)); err != nil {
return err
}
if err := exportNDJSON(filepath.Join(dir, "link_statuses.ndjson"), linkStatusesToNDJSON(r)); err != nil {
return err
}
return nil
}
func exportCSVPages(path string, r report.Report) error {
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
w := csv.NewWriter(f)
defer w.Flush()
_ = w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"})
for u, pm := range r.Pages {
rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
_ = w.Write(rec)
}
return w.Error()
}
func exportCSVLinks(path string, r report.Report) error {
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
w := csv.NewWriter(f)
defer w.Flush()
_ = w.Write([]string{"sourceUrl", "targetUrl"})
for src, lst := range r.PageOutlinks {
for _, dst := range lst {
_ = w.Write([]string{src, dst})
}
}
return w.Error()
}
type ndjsonItem interface{}
func exportNDJSON(path string, items []ndjsonItem) error {
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
enc := json.NewEncoder(f)
for _, it := range items {
if err := enc.Encode(it); err != nil {
return err
}
}
return nil
}
func pagesToNDJSON(r report.Report) []ndjsonItem {
res := make([]ndjsonItem, 0, len(r.Pages))
for u, pm := range r.Pages {
res = append(res, map[string]any{
"type": "page",
"url": u,
"title": pm.Title,
"responseTimeMs": pm.ResponseTimeMs,
"contentLength": pm.ContentLength,
"depth": pm.Depth,
})
}
return res
}
func linksToNDJSON(r report.Report) []ndjsonItem {
var res []ndjsonItem
for src, lst := range r.PageOutlinks {
for _, dst := range lst {
res = append(res, map[string]any{
"type": "link",
"src": src,
"dest": dst,
})
}
}
return res
}
func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
res := make([]ndjsonItem, 0, len(r.LinkStatuses))
for _, ls := range r.LinkStatuses {
res = append(res, map[string]any{
"type": "link_status",
"url": ls.URL,
"statusCode": ls.StatusCode,
"ok": ls.OK,
"error": ls.Err,
})
}
return res
}
func saveReportJSON(baseDir string, r report.Report) error {
u, err := url.Parse(r.Target)
if err != nil || u.Host == "" {
return fmt.Errorf("invalid target for save: %s", r.Target)
}
if err := os.MkdirAll(baseDir, 0o755); err != nil {
return err
}
path := filepath.Join(baseDir, u.Host+".json")
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
enc := json.NewEncoder(f)
enc.SetIndent("", " ")
return enc.Encode(r)
}

reports/REPORT_SCHEMA.md

@@ -0,0 +1,59 @@
## URLCrawler Report JSON Schema
This document describes the structure of the JSON reports produced by `urlcrawler` when run with `-output json`.
### Top-level object
```json
{
"target": "https://example.com",
"crawledUrls": ["https://example.com", "https://example.com/about"],
"sitemapUrls": ["https://example.com", "https://example.com/about"],
"crawlErrors": {"https://bad.example": "error string"},
"linkStatuses": [
{"url": "https://example.com", "statusCode": 200, "ok": true},
{"url": "https://other.example/broken", "statusCode": 404, "ok": false, "error": "..."}
],
"pageOutlinks": {
"https://example.com": ["https://example.com/about", "https://other.example/"]
},
"linkSources": {
"https://example.com/about": ["https://example.com"]
},
"missingInSitemap": ["https://example.com/page-not-in-sitemap"],
"inSitemapNotCrawled": ["https://example.com/deferred"]
}
```
### Fields
- **target** (string): Normalized start URL used for the crawl.
- **crawledUrls** (string[]): Unique URLs that were visited during crawling. Sorted for stability.
- **sitemapUrls** (string[]; optional): All URLs discovered via `sitemap.xml` (and nested sitemaps). Omitted when no sitemap is found.
- **crawlErrors** (object map<string,string>; optional): Maps URL → error message for requests that failed (e.g., network/TLS/timeouts). Only set when errors occurred.
- **linkStatuses** (LinkStatus[]): Result of HTTP status checks for all unique links discovered (including the pages themselves).
- **url** (string): The checked URL.
- **statusCode** (number): HTTP status code (0 if request failed before a response was received).
- **ok** (boolean): Convenience flag, true when `200 ≤ statusCode < 400` and no error occurred.
- **error** (string; optional): Error string when a request failed or there was another client error.
- **pageOutlinks** (object map<string,string[]>): For each crawled page URL, the list of normalized outgoing links (internal and external).
- **linkSources** (object map<string,string[]>): Inverse index: for each discovered link URL, the list of page URLs where it appeared.
- **missingInSitemap** (string[]; optional): URLs that were crawled but not present in the sitemap.
- **inSitemapNotCrawled** (string[]; optional): URLs present in the sitemap that were not crawled (e.g., due to depth limits or off-host rules).
### Notes
- URLs are normalized and deduplicated during crawl.
- Content-type filtering: only `text/html` pages are parsed for outlinks.
- Sitemap fetching is best-effort; absence is not treated as an error.
- The JSON lists are sorted to produce stable outputs across runs.
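As a rough illustration of consuming this schema, here is a minimal Go sketch with struct tags mirroring the documented keys. The struct names and the report path are assumptions for the example, not part of the crawler's exported API.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// LinkStatus mirrors one entry of "linkStatuses" as documented above.
type LinkStatus struct {
	URL        string `json:"url"`
	StatusCode int    `json:"statusCode"`
	OK         bool   `json:"ok"`
	Error      string `json:"error,omitempty"`
}

// Report mirrors the top-level object of the JSON report.
type Report struct {
	Target              string              `json:"target"`
	CrawledURLs         []string            `json:"crawledUrls"`
	SitemapURLs         []string            `json:"sitemapUrls,omitempty"`
	CrawlErrors         map[string]string   `json:"crawlErrors,omitempty"`
	LinkStatuses        []LinkStatus        `json:"linkStatuses"`
	PageOutlinks        map[string][]string `json:"pageOutlinks"`
	LinkSources         map[string][]string `json:"linkSources"`
	MissingInSitemap    []string            `json:"missingInSitemap,omitempty"`
	InSitemapNotCrawled []string            `json:"inSitemapNotCrawled,omitempty"`
}

func main() {
	data, err := os.ReadFile("reports/example.com.json") // assumed report path
	if err != nil {
		panic(err)
	}
	var r Report
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	// Count broken links using the documented "ok" convenience flag.
	broken := 0
	for _, ls := range r.LinkStatuses {
		if !ls.OK {
			broken++
		}
	}
	fmt.Printf("%s: crawled=%d links=%d broken=%d\n",
		r.Target, len(r.CrawledURLs), len(r.LinkStatuses), broken)
}
```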


@@ -0,0 +1,290 @@
{
"target": "https://titan-training.ca",
"crawledUrls": [
"https://titan-training.ca",
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca"
],
"sitemapUrls": [
"https://titan-training.ca/home",
"https://titan-training.ca/test_path?item=123"
],
"linkStatuses": [
{
"url": "https://titan-training.ca/products-list",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.cloudflare.com/5xx-error-landing",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.facebook.com/titantrainingkw",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public",
"statusCode": 200,
"ok": true
},
{
"url": "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
"statusCode": 200,
"ok": true
},
{
"url": "https://developers.cloudflare.com/fundamentals/setup/account/create-account",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.cloudflare.com/sign-up?utm_source=email_protection",
"statusCode": 403,
"ok": false
},
{
"url": "https://titan-training.ca/cdn-cgi/l/email-protection",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/titan-training.ca",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/.",
"statusCode": 200,
"ok": true
},
{
"url": "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"statusCode": 404,
"ok": false
},
{
"url": "https://www.instagram.com/titan__training",
"statusCode": 200,
"ok": true
},
{
"url": "https://titan-training.ca/product-details/product/titan-training.ca",
"statusCode": 200,
"ok": true
}
],
"pageOutlinks": {
"https://titan-training.ca": [
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/.": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/cdn-cgi/l/email-protection": [
"https://developers.cloudflare.com/fundamentals/setup/account/create-account",
"https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
"https://www.cloudflare.com/5xx-error-landing",
"https://www.cloudflare.com/sign-up?utm_source=email_protection"
],
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/titan-training.ca",
"https://titan-training.ca/products-list",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/product-details/product/titan-training.ca",
"https://titan-training.ca/products-list",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/products-list": [
"https://titan-training.ca/",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
],
"https://titan-training.ca/titan-training.ca": [
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca",
"https://www.facebook.com/titantrainingkw",
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
"https://www.instagram.com/titan__training",
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
]
},
"linkSources": {
"https://developers.cloudflare.com/fundamentals/setup/account/create-account": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://titan-training.ca/": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/.": [
"https://titan-training.ca"
],
"https://titan-training.ca/cdn-cgi/l/email-protection": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/product-details/product/titan-training.ca": [
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275"
],
"https://titan-training.ca/products-list": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://titan-training.ca/titan-training.ca": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.cloudflare.com/5xx-error-landing": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://www.cloudflare.com/sign-up?utm_source=email_protection": [
"https://titan-training.ca/cdn-cgi/l/email-protection"
],
"https://www.facebook.com/titantrainingkw": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca"
],
"https://www.instagram.com/titan__training": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
],
"https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public": [
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/titan-training.ca",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca",
"https://titan-training.ca/products-list"
]
},
"missingInSitemap": [
"https://titan-training.ca",
"https://titan-training.ca/",
"https://titan-training.ca/.",
"https://titan-training.ca/cdn-cgi/l/email-protection",
"https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
"https://titan-training.ca/product-details/product/681331db52e2115c63435275",
"https://titan-training.ca/products-list",
"https://titan-training.ca/titan-training.ca"
],
"inSitemapNotCrawled": [
"https://titan-training.ca/home",
"https://titan-training.ca/test_path?item=123"
]
}