Compare commits

9 Commits

| Author | SHA1 | Date |
|---|---|---|
| | bbb7808d1f | |
| | a3d277488f | |
| | deb54fdb6b | |
| | 312c09d825 | |
| | f4b2fab683 | |
| | 95041372e6 | |
| | e80e0be97f | |
| | 89adad8ad8 | |
| | 3eb9ab48bf | |
```diff
@@ -0,0 +1,11 @@
+# Binaries (root)
+urlcrawler
+
+# Local reports and exports (ignore by default)
+reports/
+exports/
+
+# Misc
+.DS_Store
+*.log
+
```
TODO.md (57 changed lines)
```diff
@@ -1,45 +1,28 @@
-## Roadmap (post v0.0.1)
+## Roadmap (post v0.0.2)
 
 Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 
-### Quick wins (target v0.0.2)
-- [ ] Add crawl metadata (startedAt, finishedAt, durationMs)
-- [ ] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
-- [ ] Status histogram (2xx/3xx/4xx/5xx totals) in summary
-- [ ] Normalize and dedupe trailing `/.` URL variants in output
-- [ ] Add compact `reportSummary` text block to JSON
-- [ ] Top external domains with counts
-- [ ] Broken links sample (first N) + per-domain broken counts
+### Shipped in v0.0.2
+- [x] Add crawl metadata (startedAt, finishedAt, durationMs)
+- [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
+- [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
+- [x] Normalize and dedupe trailing `/.` URL variants in output
+- [x] Add compact `reportSummary` text block to JSON
+- [x] Top external domains with counts
+- [x] Broken links sample (first N) + per-domain broken counts
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 
-### Moderate scope
-- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length
-- [ ] Basic page metadata: `<title>`, canonical (if present)
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Duplicate title/canonical detection (lists of URLs)
-
-### Content/asset analysis
-- [ ] Extract assets (images/css/js) per page with status/type/size
-- [ ] Mixed-content detection (http assets on https pages)
-- [ ] Image accessibility metric (alt present ratio)
-
-### Security and quality signals
-- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
-- [ ] Insecure forms (http action on https page)
-- [ ] Large pages and slow pages (p95 thresholds) summary
-
-### Link behavior and graph
-- [ ] Redirect map (from → to, hops; count summary)
-- [ ] Indegree/outdegree stats; small graph summary
-
-### Outputs and UX
-- [ ] CSV exports: pages.csv, links.csv, assets.csv
-- [ ] NDJSON export option for streaming pipelines
-- [ ] Optional: include file/line anchors in JSON for large outputs
+### Next (target v0.0.3)
+- [x] CSV exports: pages.csv, links.csv
+- [x] NDJSON export option for streaming pipelines
 
 ### Notes
-- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
-- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.
+- All report metrics must be gathered by default with zero flags required.
+- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.
```
```diff
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -euo pipefail
+
+APP=urlcrawler
+DIST_DIR=dist
+
+rm -rf "$DIST_DIR"
+mkdir -p "$DIST_DIR"
+
+echo "Building $APP for darwin/amd64..."
+GOOS=darwin GOARCH=amd64 go build -o "$DIST_DIR/${APP}-darwin-amd64"
+
+echo "Building $APP for darwin/arm64..."
+GOOS=darwin GOARCH=arm64 go build -o "$DIST_DIR/${APP}-darwin-arm64"
+
+echo "Building $APP for linux/amd64..."
+GOOS=linux GOARCH=amd64 go build -o "$DIST_DIR/${APP}-linux-amd64"
+
+echo "Building $APP for linux/arm64..."
+GOOS=linux GOARCH=arm64 go build -o "$DIST_DIR/${APP}-linux-arm64"
+
+echo "Done. Artifacts in $DIST_DIR/"
```
Binary files not shown (4).
```diff
@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
 
+type PageInfo struct {
+	Title          string
+	ResponseTimeMs int64
+	ContentLength  int
+	Depth          int
+}
+
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
 
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
 
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 
 func hasPrefix(s string, prefix string) bool {
```
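For orientation, a minimal sketch of a call site for the widened `Crawl` signature. The import path `urlcrawler/internal/crawler`, the target URL, and the user agent below are assumptions for illustration; main.go (further down) shows the real wiring with progress counters.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler" // assumed package path for the file above
)

func main() {
	ctx := context.Background()
	client := &http.Client{Timeout: 10 * time.Second}

	// No-op progress callbacks; the real CLI updates atomic counters here.
	visitedCb := func(u string, depth, pending int) {}
	errorCb := func(u string, err error, pending int) {}

	// Crawl now returns a fourth value: per-page PageInfo keyed by URL.
	visited, errs, outlinks, pageInfos := crawler.Crawl(
		ctx, "https://example.com", 2, 10, true, client, "example-agent", visitedCb, errorCb)

	fmt.Printf("visited=%d errors=%d pages-with-outlinks=%d\n", len(visited), len(errs), len(outlinks))
	for u, pi := range pageInfos {
		fmt.Printf("%s depth=%d title=%q %dms %dB\n", u, pi.Depth, pi.Title, pi.ResponseTimeMs, pi.ContentLength)
	}
}
```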
```diff
@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
```
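A quick usage sketch for the new helper; the sample document is invented for illustration, and the import path follows the `urlcrawler/internal/htmlx` import shown in the crawler diff above.

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx"
)

func main() {
	doc := `<html><head><title>  Hello, World  </title></head><body></body></html>`
	// ExtractTitle reads the first <title> text token and trims surrounding
	// whitespace, so this prints "Hello, World".
	fmt.Println(htmlx.ExtractTitle(strings.NewReader(doc)))
}
```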
```diff
@@ -3,7 +3,9 @@ package report
 import (
 	"fmt"
 	"io"
+	"net/url"
 	"sort"
+	"strings"
 
 	"urlcrawler/internal/linkcheck"
 )
@@ -18,31 +20,133 @@ type Report struct {
 	LinkSources         map[string][]string `json:"linkSources"`
 	MissingInSitemap    []string            `json:"missingInSitemap,omitempty"`
 	InSitemapNotCrawled []string            `json:"inSitemapNotCrawled,omitempty"`
+	Metadata            Metadata               `json:"metadata"`
+	Params              Params                 `json:"params"`
+	Stats               Stats                  `json:"stats"`
+	ReportSummary       string                 `json:"reportSummary,omitempty"`
+	TopExternalDomains  []DomainCount          `json:"topExternalDomains,omitempty"`
+	BrokenSample        []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
+	BrokenByDomain      []DomainCount          `json:"brokenByDomain,omitempty"`
+	Pages               map[string]PageMeta    `json:"pages"`
+	DepthDistribution   map[int]int            `json:"depthDistribution"`
+	Robots              RobotsSummary          `json:"robots"`
 }
 
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}) Report {
-	crawledList := keys(crawled)
-	sitemapList := keys(sitemap)
+type Metadata struct {
+	StartedAt  string `json:"startedAt"`  // RFC3339
+	FinishedAt string `json:"finishedAt"` // RFC3339
+	DurationMs int64  `json:"durationMs"`
+}
+
+type Params struct {
+	MaxDepth     int    `json:"maxDepth"`
+	Concurrency  int    `json:"concurrency"`
+	TimeoutMs    int64  `json:"timeoutMs"`
+	UserAgent    string `json:"userAgent"`
+	SameHostOnly bool   `json:"sameHostOnly"`
+}
+
+type Stats struct {
+	OK          int `json:"ok"`
+	Broken      int `json:"broken"`
+	Status2xx   int `json:"status2xx"`
+	Status3xx   int `json:"status3xx"`
+	Status4xx   int `json:"status4xx"`
+	Status5xx   int `json:"status5xx"`
+	StatusOther int `json:"statusOther"`
+}
+
+type DomainCount struct {
+	Domain string `json:"domain"`
+	Count  int    `json:"count"`
+}
+
+type PageMeta struct {
+	Title          string `json:"title"`
+	ResponseTimeMs int64  `json:"responseTimeMs"`
+	ContentLength  int    `json:"contentLength"`
+	Depth          int    `json:"depth"`
+}
+
+type RobotsSummary struct {
+	Present   bool   `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
+	crawledList := sanitizeURLs(keys(crawled))
+	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
 	for k, v := range crawlErrs {
 		crawlErrMap[k] = v.Error()
 	}
 
 	missing := difference(crawled, sitemap)
-	missingList := keys(missing)
+	missingList := sanitizeURLs(keys(missing))
 	inSmNotCrawled := difference(sitemap, crawled)
-	inSmNotCrawledList := keys(inSmNotCrawled)
+	inSmNotCrawledList := sanitizeURLs(keys(inSmNotCrawled))
 
 	pageOut := make(map[string][]string, len(outlinks))
 	linkSrc := make(map[string][]string)
 	for page, set := range outlinks {
-		lst := keys(set)
+		lst := sanitizeURLs(keys(set))
 		pageOut[page] = lst
 		for _, u := range lst {
 			linkSrc[u] = append(linkSrc[u], page)
 		}
 	}
 
+	// Compute simple status histogram
+	var st Stats
+	for _, ls := range check.Statuses {
+		if ls.OK {
+			st.OK++
+		} else {
+			st.Broken++
+		}
+		switch {
+		case ls.StatusCode >= 200 && ls.StatusCode < 300:
+			st.Status2xx++
+		case ls.StatusCode >= 300 && ls.StatusCode < 400:
+			st.Status3xx++
+		case ls.StatusCode >= 400 && ls.StatusCode < 500:
+			st.Status4xx++
+		case ls.StatusCode >= 500 && ls.StatusCode < 600:
+			st.Status5xx++
+		default:
+			st.StatusOther++
+		}
+	}
+
+	// Derived summaries
+	tHost := hostLower(target)
+	extCounts := map[string]int{}
+	brokenByDomain := map[string]int{}
+	var brokenSample []linkcheck.LinkStatus
+	for _, ls := range check.Statuses {
+		h := hostLower(ls.URL)
+		if h != "" && !strings.EqualFold(h, tHost) {
+			extCounts[h]++
+		}
+		if !ls.OK {
+			brokenByDomain[h]++
+			if len(brokenSample) < 10 {
+				brokenSample = append(brokenSample, ls)
+			}
+		}
+	}
+	topExternal := mapToSortedSlice(extCounts)
+	brokenBy := mapToSortedSlice(brokenByDomain)
+
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
+
+	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
+		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
+
 	return Report{
 		Target:      target,
 		CrawledURLs: crawledList,
@@ -53,6 +157,16 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 		LinkSources:         linkSrc,
 		MissingInSitemap:    missingList,
 		InSitemapNotCrawled: inSmNotCrawledList,
+		Metadata:            meta,
+		Params:              params,
+		Stats:               st,
+		ReportSummary:       summary,
+		TopExternalDomains:  topExternal,
+		BrokenSample:        brokenSample,
+		BrokenByDomain:      brokenBy,
+		Pages:               pages,
+		DepthDistribution:   depthDist,
+		Robots:              robots,
 	}
 }
 
@@ -94,3 +208,48 @@ func difference(a, b map[string]struct{}) map[string]struct{} {
 	}
 	return res
 }
+
+// sanitizeURLs normalizes small variants like trailing "/." to "/" for consistency.
+func sanitizeURLs(urls []string) []string {
+	out := make([]string, 0, len(urls))
+	for _, u := range urls {
+		out = append(out, sanitizeURL(u))
+	}
+	sort.Strings(out)
+	return out
+}
+
+func sanitizeURL(u string) string {
+	if len(u) >= 2 && u[len(u)-2:] == "/." {
+		return u[:len(u)-1]
+	}
+	return u
+}
+
+func hostLower(raw string) string {
+	u, err := url.Parse(raw)
+	if err != nil {
+		return ""
+	}
+	return strings.ToLower(u.Host)
+}
+
+func mapToSortedSlice(m map[string]int) []DomainCount {
+	if len(m) == 0 {
+		return nil
+	}
+	out := make([]DomainCount, 0, len(m))
+	for k, v := range m {
+		out = append(out, DomainCount{Domain: k, Count: v})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].Count == out[j].Count {
+			return out[i].Domain < out[j].Domain
+		}
+		return out[i].Count > out[j].Count
+	})
+	if len(out) > 10 {
+		out = out[:10]
+	}
+	return out
+}
```
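To make the `/.` normalization concrete, a small standalone sketch; `sanitizeURL` is restated verbatim from the diff above (it is unexported in package report), and the sample inputs echo the duplicate `https://titan-training.ca/.` entry visible in the deleted report further down.

```go
package main

import "fmt"

// sanitizeURL, copied from the report package diff for illustration:
// a URL ending in "/." is collapsed to end in "/".
func sanitizeURL(u string) string {
	if len(u) >= 2 && u[len(u)-2:] == "/." {
		return u[:len(u)-1]
	}
	return u
}

func main() {
	fmt.Println(sanitizeURL("https://titan-training.ca/.")) // -> https://titan-training.ca/
	fmt.Println(sanitizeURL("https://example.com/page"))    // unchanged
}
```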
main.go (203 changed lines)
```diff
@@ -2,12 +2,15 @@ package main
 
 import (
 	"context"
+	"encoding/csv"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"net/http"
+	"net/url"
 	"os"
+	"path/filepath"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -16,6 +19,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 
 func main() {
@@ -27,6 +31,7 @@ func main() {
 	var sameHostOnly bool
 	var output string
 	var quiet bool
+	var exportDir string
 
 	flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
 	flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
@@ -36,6 +41,7 @@ func main() {
 	flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
 	flag.StringVar(&output, "output", "text", "Output format: text|json")
 	flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
+	flag.StringVar(&exportDir, "export-dir", "exports", "Directory to write CSV/NDJSON exports into (set empty to disable)")
 	flag.Parse()
 
 	if strings.TrimSpace(target) == "" {
@@ -47,6 +53,17 @@ func main() {
 	client := &http.Client{Timeout: timeout}
 	ctx := context.Background()
 
+	// Report metadata
+	started := time.Now()
+	meta := report.Metadata{StartedAt: started.UTC().Format(time.RFC3339)}
+	params := report.Params{
+		MaxDepth:     maxDepth,
+		Concurrency:  concurrency,
+		TimeoutMs:    timeout.Milliseconds(),
+		UserAgent:    userAgent,
+		SameHostOnly: sameHostOnly,
+	}
+
 	fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)
 
 	// Setup progress counters
@@ -88,7 +105,7 @@ func main() {
 		currentURL.Store(u)
 	}
 
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -102,6 +119,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
 
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
+
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
@@ -137,8 +170,35 @@ func main() {
 			urlsVisited.Load(), urlsErrored.Load())
 	}
 
+	finished := time.Now()
+	meta.FinishedAt = finished.UTC().Format(time.RFC3339)
+	meta.DurationMs = finished.Sub(started).Milliseconds()
+
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks)
+
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title:          pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength:  pi.ContentLength,
+			Depth:          pi.Depth,
+		}
+	}
+
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
+
+	if exportDir != "" {
+		if err := exportAll(exportDir, reports); err != nil {
+			fmt.Fprintf(os.Stderr, "export error: %v\n", err)
+		}
+	}
+
+	// Save JSON report to ./reports/<host>.json by default (ignored by git)
+	if err := saveReportJSON("reports", reports); err != nil {
+		fmt.Fprintf(os.Stderr, "save report error: %v\n", err)
+	}
+
 	switch output {
 	case "json":
@@ -160,3 +220,142 @@ func truncateForTTY(s string, max int) string {
 	}
 	return s[:max-1] + "…"
 }
+
+func exportAll(baseDir string, r report.Report) error {
+	u, err := url.Parse(r.Target)
+	if err != nil || u.Host == "" {
+		return fmt.Errorf("invalid target for export: %s", r.Target)
+	}
+	dir := filepath.Join(baseDir, u.Host)
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return err
+	}
+	if err := exportCSVPages(filepath.Join(dir, "pages.csv"), r); err != nil {
+		return err
+	}
+	if err := exportCSVLinks(filepath.Join(dir, "links.csv"), r); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "pages.ndjson"), pagesToNDJSON(r)); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "links.ndjson"), linksToNDJSON(r)); err != nil {
+		return err
+	}
+	if err := exportNDJSON(filepath.Join(dir, "link_statuses.ndjson"), linkStatusesToNDJSON(r)); err != nil {
+		return err
+	}
+	return nil
+}
+
+func exportCSVPages(path string, r report.Report) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	w := csv.NewWriter(f)
+	defer w.Flush()
+	_ = w.Write([]string{"url", "title", "responseTimeMs", "contentLength", "depth"})
+	for u, pm := range r.Pages {
+		rec := []string{u, pm.Title, fmt.Sprintf("%d", pm.ResponseTimeMs), fmt.Sprintf("%d", pm.ContentLength), fmt.Sprintf("%d", pm.Depth)}
+		_ = w.Write(rec)
+	}
+	return w.Error()
+}
+
+func exportCSVLinks(path string, r report.Report) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	w := csv.NewWriter(f)
+	defer w.Flush()
+	_ = w.Write([]string{"sourceUrl", "targetUrl"})
+	for src, lst := range r.PageOutlinks {
+		for _, dst := range lst {
+			_ = w.Write([]string{src, dst})
+		}
+	}
+	return w.Error()
+}
+
+type ndjsonItem interface{}
+
+func exportNDJSON(path string, items []ndjsonItem) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	enc := json.NewEncoder(f)
+	for _, it := range items {
+		if err := enc.Encode(it); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func pagesToNDJSON(r report.Report) []ndjsonItem {
+	res := make([]ndjsonItem, 0, len(r.Pages))
+	for u, pm := range r.Pages {
+		res = append(res, map[string]any{
+			"type":           "page",
+			"url":            u,
+			"title":          pm.Title,
+			"responseTimeMs": pm.ResponseTimeMs,
+			"contentLength":  pm.ContentLength,
+			"depth":          pm.Depth,
+		})
+	}
+	return res
+}
+
+func linksToNDJSON(r report.Report) []ndjsonItem {
+	var res []ndjsonItem
+	for src, lst := range r.PageOutlinks {
+		for _, dst := range lst {
+			res = append(res, map[string]any{
+				"type": "link",
+				"src":  src,
+				"dest": dst,
+			})
+		}
+	}
+	return res
+}
+
+func linkStatusesToNDJSON(r report.Report) []ndjsonItem {
+	res := make([]ndjsonItem, 0, len(r.LinkStatuses))
+	for _, ls := range r.LinkStatuses {
+		res = append(res, map[string]any{
+			"type":       "link_status",
+			"url":        ls.URL,
+			"statusCode": ls.StatusCode,
+			"ok":         ls.OK,
+			"error":      ls.Err,
+		})
+	}
+	return res
+}
+
+func saveReportJSON(baseDir string, r report.Report) error {
+	u, err := url.Parse(r.Target)
+	if err != nil || u.Host == "" {
+		return fmt.Errorf("invalid target for save: %s", r.Target)
+	}
+	if err := os.MkdirAll(baseDir, 0o755); err != nil {
+		return err
+	}
+	path := filepath.Join(baseDir, u.Host+".json")
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	enc := json.NewEncoder(f)
+	enc.SetIndent("", "  ")
+	return enc.Encode(r)
+}
```
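For clarity on the NDJSON framing: `exportNDJSON` leans on `json.Encoder.Encode`, which appends a newline after every object, so each file holds one JSON object per line. A minimal sketch of the same loop writing to stdout instead of a file; the item contents are illustrative, not taken from a real run.

```go
package main

import (
	"encoding/json"
	"os"
)

// Mirrors the exportNDJSON loop above: one JSON object per line (NDJSON).
func main() {
	enc := json.NewEncoder(os.Stdout)
	items := []any{
		map[string]any{"type": "page", "url": "https://example.com", "depth": 0},
		map[string]any{"type": "link", "src": "https://example.com", "dest": "https://example.com/about"},
	}
	for _, it := range items {
		if err := enc.Encode(it); err != nil {
			panic(err)
		}
	}
}
```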
````diff
@@ -1,59 +0,0 @@
-## URLCrawler Report JSON Schema
-
-This document describes the structure of the JSON reports produced by `urlcrawler` when run with `-output json`.
-
-### Top-level object
-
-```json
-{
-  "target": "https://example.com",
-  "crawledUrls": ["https://example.com", "https://example.com/about"],
-  "sitemapUrls": ["https://example.com", "https://example.com/about"],
-  "crawlErrors": {"https://bad.example": "error string"},
-  "linkStatuses": [
-    {"url": "https://example.com", "statusCode": 200, "ok": true},
-    {"url": "https://other.example/broken", "statusCode": 404, "ok": false, "error": "..."}
-  ],
-  "pageOutlinks": {
-    "https://example.com": ["https://example.com/about", "https://other.example/"]
-  },
-  "linkSources": {
-    "https://example.com/about": ["https://example.com"]
-  },
-  "missingInSitemap": ["https://example.com/page-not-in-sitemap"],
-  "inSitemapNotCrawled": ["https://example.com/deferred"]
-}
-```
-
-### Fields
-
-- **target** (string): Normalized start URL used for the crawl.
-
-- **crawledUrls** (string[]): Unique URLs that were visited during crawling. Sorted for stability.
-
-- **sitemapUrls** (string[]; optional): All URLs discovered via `sitemap.xml` (and nested sitemaps). Present unless the sitemap is not found.
-
-- **crawlErrors** (object map<string,string>; optional): Maps URL → error message for requests that failed (e.g., network/TLS/timeouts). Only set when errors occurred.
-
-- **linkStatuses** (LinkStatus[]): Result of HTTP status checks for all unique links discovered (including the pages themselves).
-  - **url** (string): The checked URL.
-  - **statusCode** (number): HTTP status code (0 if request failed before a response was received).
-  - **ok** (boolean): Convenience flag, true when `200 ≤ statusCode < 400` and no error occurred.
-  - **error** (string; optional): Error string when a request failed or there was another client error.
-
-- **pageOutlinks** (object map<string,string[]>): For each crawled page URL, the list of normalized outgoing links (internal and external).
-
-- **linkSources** (object map<string,string[]>): Inverse index: for each discovered link URL, the list of page URLs where it appeared.
-
-- **missingInSitemap** (string[]; optional): URLs that were crawled but not present in the sitemap.
-
-- **inSitemapNotCrawled** (string[]; optional): URLs present in the sitemap that were not crawled (e.g., due to depth limits or off-host rules).
-
-### Notes
-
-- URLs are normalized and deduplicated during crawl.
-- Content-type filtering: only `text/html` pages are parsed for outlinks.
-- Sitemap fetching is best-effort; absence is not treated as an error.
-- The JSON lists are sorted to produce stable outputs across runs.
````
```diff
@@ -1,290 +0,0 @@
-{
-  "target": "https://titan-training.ca",
-  "crawledUrls": [
-    "https://titan-training.ca",
-    "https://titan-training.ca/",
-    "https://titan-training.ca/.",
-    "https://titan-training.ca/cdn-cgi/l/email-protection",
-    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-    "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-    "https://titan-training.ca/products-list",
-    "https://titan-training.ca/titan-training.ca"
-  ],
-  "sitemapUrls": [
-    "https://titan-training.ca/home",
-    "https://titan-training.ca/test_path?item=123"
-  ],
-  "linkStatuses": [
-    {
-      "url": "https://titan-training.ca/products-list",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://www.cloudflare.com/5xx-error-landing",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://www.facebook.com/titantrainingkw",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://developers.cloudflare.com/fundamentals/setup/account/create-account",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://www.cloudflare.com/sign-up?utm_source=email_protection",
-      "statusCode": 403,
-      "ok": false
-    },
-    {
-      "url": "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/titan-training.ca",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/.",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
-      "statusCode": 404,
-      "ok": false
-    },
-    {
-      "url": "https://www.instagram.com/titan__training",
-      "statusCode": 200,
-      "ok": true
-    },
-    {
-      "url": "https://titan-training.ca/product-details/product/titan-training.ca",
-      "statusCode": 200,
-      "ok": true
-    }
-  ],
-  "pageOutlinks": {
-    "https://titan-training.ca": [
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/products-list",
-      "https://titan-training.ca/titan-training.ca",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/products-list",
-      "https://titan-training.ca/titan-training.ca",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/.": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/products-list",
-      "https://titan-training.ca/titan-training.ca",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/cdn-cgi/l/email-protection": [
-      "https://developers.cloudflare.com/fundamentals/setup/account/create-account",
-      "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
-      "https://www.cloudflare.com/5xx-error-landing",
-      "https://www.cloudflare.com/sign-up?utm_source=email_protection"
-    ],
-    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/titan-training.ca",
-      "https://titan-training.ca/products-list",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca/product-details/product/titan-training.ca",
-      "https://titan-training.ca/products-list",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/products-list": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca/products-list",
-      "https://titan-training.ca/titan-training.ca",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ],
-    "https://titan-training.ca/titan-training.ca": [
-      "https://titan-training.ca/cdn-cgi/l/email-protection",
-      "https://titan-training.ca/products-list",
-      "https://titan-training.ca/titan-training.ca",
-      "https://www.facebook.com/titantrainingkw",
-      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
-      "https://www.instagram.com/titan__training",
-      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
-    ]
-  },
-  "linkSources": {
-    "https://developers.cloudflare.com/fundamentals/setup/account/create-account": [
-      "https://titan-training.ca/cdn-cgi/l/email-protection"
-    ],
-    "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation": [
-      "https://titan-training.ca/cdn-cgi/l/email-protection"
-    ],
-    "https://titan-training.ca/": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://titan-training.ca/.": [
-      "https://titan-training.ca"
-    ],
-    "https://titan-training.ca/cdn-cgi/l/email-protection": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://titan-training.ca/product-details/product/titan-training.ca": [
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275"
-    ],
-    "https://titan-training.ca/products-list": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://titan-training.ca/titan-training.ca": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://www.cloudflare.com/5xx-error-landing": [
-      "https://titan-training.ca/cdn-cgi/l/email-protection"
-    ],
-    "https://www.cloudflare.com/sign-up?utm_source=email_protection": [
-      "https://titan-training.ca/cdn-cgi/l/email-protection"
-    ],
-    "https://www.facebook.com/titantrainingkw": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca"
-    ],
-    "https://www.instagram.com/titan__training": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ],
-    "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public": [
-      "https://titan-training.ca/",
-      "https://titan-training.ca/.",
-      "https://titan-training.ca/titan-training.ca",
-      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-      "https://titan-training.ca",
-      "https://titan-training.ca/products-list"
-    ]
-  },
-  "missingInSitemap": [
-    "https://titan-training.ca",
-    "https://titan-training.ca/",
-    "https://titan-training.ca/.",
-    "https://titan-training.ca/cdn-cgi/l/email-protection",
-    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
-    "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
-    "https://titan-training.ca/products-list",
-    "https://titan-training.ca/titan-training.ca"
-  ],
-  "inSitemapNotCrawled": [
-    "https://titan-training.ca/home",
-    "https://titan-training.ca/test_path?item=123"
-  ]
-}
```