feat: initial import with progress UI; docs: REPORT_SCHEMA, TODO; v0.0.1
commit e7b4d33971

@@ -0,0 +1,45 @@
## Roadmap (post v0.0.1)

Prioritized from easiest/low-risk to more involved work. Check off as we ship.

### Quick wins (target v0.0.2)

- [ ] Add crawl metadata (startedAt, finishedAt, durationMs)
- [ ] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
- [ ] Status histogram (2xx/3xx/4xx/5xx totals) in summary (see the sketch after this list)
- [ ] Normalize and dedupe trailing `/.` URL variants in output
- [ ] Add compact `reportSummary` text block to JSON
- [ ] Top external domains with counts
- [ ] Broken links sample (first N) + per-domain broken counts
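
A minimal sketch of what the status histogram could look like, building on the `linkcheck.LinkStatus` type added in this commit; the `Histogram` helper and its bucket keys are illustrative, not shipped code:

```go
package report

import "urlcrawler/internal/linkcheck"

// Histogram buckets link-check results by status class. The bucket keys
// ("2xx" through "5xx", plus "error" for failed requests) are assumptions
// for this sketch.
func Histogram(statuses []linkcheck.LinkStatus) map[string]int {
	h := make(map[string]int)
	for _, s := range statuses {
		switch {
		case s.StatusCode >= 200 && s.StatusCode < 300:
			h["2xx"]++
		case s.StatusCode >= 300 && s.StatusCode < 400:
			h["3xx"]++
		case s.StatusCode >= 400 && s.StatusCode < 500:
			h["4xx"]++
		case s.StatusCode >= 500:
			h["5xx"]++
		default:
			// StatusCode 0: the request failed before any response arrived.
			h["error"]++
		}
	}
	return h
}
```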
### Moderate scope

- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
- [ ] Per-page response time (responseTimeMs) and content length
- [ ] Basic page metadata: `<title>`, canonical (if present)
- [ ] Depth distribution (count of pages by depth)
- [ ] Duplicate title/canonical detection (lists of URLs)

### Content/asset analysis

- [ ] Extract assets (images/css/js) per page with status/type/size
- [ ] Mixed-content detection (http assets on https pages)
- [ ] Image accessibility metric (alt present ratio)

### Security and quality signals

- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
- [ ] Insecure forms (http action on https page)
- [ ] Large pages and slow pages (p95 thresholds) summary

### Link behavior and graph

- [ ] Redirect map (from → to, hops; count summary)
- [ ] Indegree/outdegree stats; small graph summary

### Outputs and UX

- [ ] CSV exports: pages.csv, links.csv, assets.csv
- [ ] NDJSON export option for streaming pipelines
- [ ] Optional: include file/line anchors in JSON for large outputs

### Notes

- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump the minor version and document it in `reports/REPORT_SCHEMA.md`.
- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.

@@ -0,0 +1,5 @@
module urlcrawler

go 1.22

require golang.org/x/net v0.29.0

@@ -0,0 +1,2 @@
golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=

@@ -0,0 +1,145 @@
package crawler

import (
	"context"
	"net/http"
	"strings"
	"sync"

	"urlcrawler/internal/htmlx"
	"urlcrawler/internal/urlutil"
)

type task struct {
	url   string
	depth int
}

// Crawl visits pages up to maxDepth and returns the visited set, per-URL errors, and per-page outgoing links.
// visitedCallback is called when a page is successfully visited and receives the URL, its depth, and the current number of pending tasks in the queue.
// errorCallback is called when a page fetch fails and receives the URL, the error, and the current number of pending tasks in the queue.
// Either callback may be nil.
func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
	visited := make(map[string]struct{})
	errs := make(map[string]error)
	outlinks := make(map[string]map[string]struct{})
	var mu sync.Mutex

	origin := urlutil.Origin(startURL)

	tasks := make(chan task, concurrency*2)
	wgWorkers := sync.WaitGroup{}
	// wgTasks counts outstanding tasks (queued or in flight) so we know when the crawl is done.
	wgTasks := sync.WaitGroup{}

	enqueue := func(t task) {
		wgTasks.Add(1)
		// Send asynchronously so a worker that discovers many links cannot
		// deadlock on a full channel while every other worker is also sending.
		go func() { tasks <- t }()
	}

	worker := func() {
		defer wgWorkers.Done()
		for tk := range tasks {
			if ctx.Err() != nil {
				wgTasks.Done()
				return
			}
			// Skip URLs we have already seen; mark new ones as visited up front.
			mu.Lock()
			if _, seen := visited[tk.url]; seen {
				mu.Unlock()
				wgTasks.Done()
				continue
			}
			visited[tk.url] = struct{}{}
			mu.Unlock()

			if visitedCallback != nil {
				visitedCallback(tk.url, tk.depth, len(tasks))
			}

			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
			req.Header.Set("User-Agent", userAgent)
			resp, err := client.Do(req)
			if err != nil {
				mu.Lock()
				errs[tk.url] = err
				mu.Unlock()

				if errorCallback != nil {
					errorCallback(tk.url, err, len(tasks))
				}
				wgTasks.Done()
				continue
			}
			func() {
				defer resp.Body.Close()
				// Only parse 200 responses that declare an HTML content type
				// (covers "text/html" and "text/html; charset=...").
				ct := resp.Header.Get("Content-Type")
				if resp.StatusCode != http.StatusOK || !strings.HasPrefix(ct, "text/html") {
					return
				}
				hrefs := htmlx.ExtractAnchors(resp.Body)
				var toEnqueue []string
				for _, href := range hrefs {
					abs, ok := urlutil.Normalize(tk.url, href)
					if !ok {
						continue
					}
					mu.Lock()
					m, ok2 := outlinks[tk.url]
					if !ok2 {
						m = make(map[string]struct{})
						outlinks[tk.url] = m
					}
					m[abs] = struct{}{}
					mu.Unlock()

					if tk.depth < maxDepth {
						if !sameHostOnly || urlutil.SameHost(origin, abs) {
							toEnqueue = append(toEnqueue, abs)
						}
					}
				}
				for _, u := range toEnqueue {
					enqueue(task{url: u, depth: tk.depth + 1})
				}
			}()
			wgTasks.Done()
		}
	}

	for i := 0; i < concurrency; i++ {
		wgWorkers.Add(1)
		go worker()
	}

	// Close the tasks channel when all enqueued tasks are processed.
	go func() {
		wgTasks.Wait()
		close(tasks)
	}()

	enqueue(task{url: startURL, depth: 0})
	wgWorkers.Wait()

	return visited, errs, outlinks
}
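
A minimal sketch of calling `Crawl` directly, per the signature and doc comment above (nil callbacks are allowed); the target URL is illustrative:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler"
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}
	// Depth 1, five workers, same-host only, no progress callbacks.
	visited, errs, outlinks := crawler.Crawl(context.Background(),
		"https://example.com", 1, 5, true, client, "urlcrawler/1.0", nil, nil)
	fmt.Println(len(visited), "pages,", len(errs), "errors,", len(outlinks), "pages with outlinks")
}
```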
@@ -0,0 +1,38 @@
package htmlx

import (
	"io"
	"strings"

	"golang.org/x/net/html"
)

// ExtractAnchors returns all hrefs from <a> tags.
func ExtractAnchors(r io.Reader) []string {
	tokens := html.NewTokenizer(r)
	var hrefs []string
	for {
		t := tokens.Next()
		switch t {
		case html.StartTagToken, html.SelfClosingTagToken:
			tn, hasAttr := tokens.TagName()
			if string(tn) != "a" || !hasAttr {
				continue
			}
			// Scan the tag's attributes for href.
			for {
				key, val, more := tokens.TagAttr()
				if string(key) == "href" {
					v := strings.TrimSpace(string(val))
					if v != "" {
						hrefs = append(hrefs, v)
					}
				}
				if !more {
					break
				}
			}
		case html.ErrorToken:
			// ErrorToken covers both end of input (io.EOF) and malformed input.
			return hrefs
		}
	}
}
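
A quick usage sketch for the extractor above; the HTML snippet is illustrative:

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx"
)

func main() {
	doc := `<p><a href=" /about ">About</a><a name="x"></a><a href="mailto:hi@example.com">mail</a></p>`
	// Prints [/about mailto:hi@example.com]: hrefs are trimmed but not
	// filtered here; scheme filtering happens later in urlutil.Normalize.
	fmt.Println(htmlx.ExtractAnchors(strings.NewReader(doc)))
}
```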
@@ -0,0 +1,84 @@
package linkcheck

import (
	"context"
	"net/http"
	"sync"
)

type LinkStatus struct {
	URL        string `json:"url"`
	StatusCode int    `json:"statusCode"`
	OK         bool   `json:"ok"`
	Err        string `json:"error,omitempty"`
}

type Results struct {
	Statuses []LinkStatus `json:"statuses"`
}

// Check fetches every URL in urls with up to concurrency workers and records
// its HTTP status. progressCallback, if non-nil, is invoked once per URL with
// whether the check succeeded. showProgress is currently unused; progress
// reporting is driven entirely by progressCallback.
func Check(ctx context.Context, urls map[string]struct{}, concurrency int, client *http.Client, userAgent string, showProgress bool, progressCallback func(bool)) Results {
	var mu sync.Mutex
	var statuses []LinkStatus

	type job struct{ u string }
	jobs := make(chan job, concurrency*2)
	wg := sync.WaitGroup{}

	worker := func() {
		defer wg.Done()
		for j := range jobs {
			status, err := headOrGet(ctx, client, userAgent, j.u)
			ls := LinkStatus{URL: j.u, StatusCode: status}
			if err != nil {
				ls.Err = err.Error()
			}
			ls.OK = err == nil && status >= 200 && status < 400
			mu.Lock()
			statuses = append(statuses, ls)
			mu.Unlock()

			if progressCallback != nil {
				progressCallback(ls.OK)
			}
		}
	}

	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go worker()
	}

	for u := range urls {
		jobs <- job{u: u}
	}
	close(jobs)
	wg.Wait()

	return Results{Statuses: statuses}
}

// headOrGet tries a HEAD request first and falls back to GET when the server
// rejects HEAD (405) or answers with a 5xx that may be HEAD-specific.
func headOrGet(ctx context.Context, client *http.Client, userAgent string, u string) (int, error) {
	req, _ := http.NewRequestWithContext(ctx, http.MethodHead, u, nil)
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err == nil {
		defer resp.Body.Close()
		if resp.StatusCode >= 200 && resp.StatusCode < 400 {
			return resp.StatusCode, nil
		}
		// Trust 4xx answers other than 405; retry 405 and 5xx with GET, since
		// some servers don't implement HEAD properly.
		if resp.StatusCode != http.StatusMethodNotAllowed && resp.StatusCode < 500 {
			return resp.StatusCode, nil
		}
	}
	// Fallback GET
	req2, _ := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	req2.Header.Set("User-Agent", userAgent)
	resp2, err2 := client.Do(req2)
	if err2 != nil {
		return 0, err2
	}
	defer resp2.Body.Close()
	return resp2.StatusCode, nil
}

@@ -0,0 +1,96 @@
package report

import (
	"fmt"
	"io"
	"sort"

	"urlcrawler/internal/linkcheck"
)

type Report struct {
	Target              string                 `json:"target"`
	CrawledURLs         []string               `json:"crawledUrls"`
	SitemapURLs         []string               `json:"sitemapUrls,omitempty"`
	CrawlErrors         map[string]string      `json:"crawlErrors,omitempty"`
	LinkStatuses        []linkcheck.LinkStatus `json:"linkStatuses"`
	PageOutlinks        map[string][]string    `json:"pageOutlinks"`
	LinkSources         map[string][]string    `json:"linkSources"`
	MissingInSitemap    []string               `json:"missingInSitemap,omitempty"`
	InSitemapNotCrawled []string               `json:"inSitemapNotCrawled,omitempty"`
}

func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}) Report {
	crawledList := keys(crawled)
	sitemapList := keys(sitemap)
	crawlErrMap := make(map[string]string, len(crawlErrs))
	for k, v := range crawlErrs {
		crawlErrMap[k] = v.Error()
	}

	missing := difference(crawled, sitemap)
	missingList := keys(missing)
	inSmNotCrawled := difference(sitemap, crawled)
	inSmNotCrawledList := keys(inSmNotCrawled)

	pageOut := make(map[string][]string, len(outlinks))
	linkSrc := make(map[string][]string)
	for page, set := range outlinks {
		lst := keys(set)
		pageOut[page] = lst
		// Build the inverse index: link URL -> pages that contain it.
		for _, u := range lst {
			linkSrc[u] = append(linkSrc[u], page)
		}
	}

	return Report{
		Target:              target,
		CrawledURLs:         crawledList,
		SitemapURLs:         sitemapList,
		CrawlErrors:         crawlErrMap,
		LinkStatuses:        check.Statuses,
		PageOutlinks:        pageOut,
		LinkSources:         linkSrc,
		MissingInSitemap:    missingList,
		InSitemapNotCrawled: inSmNotCrawledList,
	}
}

func PrintText(w io.Writer, r Report) {
	fmt.Fprintf(w, "Target: %s\n\n", r.Target)
	fmt.Fprintf(w, "Crawled URLs: %d\n", len(r.CrawledURLs))
	fmt.Fprintf(w, "Sitemap URLs: %d\n", len(r.SitemapURLs))
	fmt.Fprintf(w, "Links checked: %d\n", len(r.LinkStatuses))
	fmt.Fprintf(w, "Missing in sitemap: %d\n", len(r.MissingInSitemap))
	fmt.Fprintf(w, "In sitemap not crawled: %d\n\n", len(r.InSitemapNotCrawled))

	// Keep text output concise; details available in JSON.
}

// keys returns the sorted keys of a set, for stable output across runs.
func keys[T comparable](m map[T]struct{}) []T {
	res := make([]T, 0, len(m))
	for k := range m {
		res = append(res, k)
	}
	sort.Slice(res, func(i, j int) bool { return asString(res[i]) < asString(res[j]) })
	return res
}

func asString[T any](v T) string {
	switch x := any(v).(type) {
	case string:
		return x
	default:
		return fmt.Sprintf("%v", v)
	}
}

// difference returns the elements of a that are not in b.
func difference(a, b map[string]struct{}) map[string]struct{} {
	res := make(map[string]struct{})
	for k := range a {
		if _, ok := b[k]; !ok {
			res[k] = struct{}{}
		}
	}
	return res
}

@@ -0,0 +1,110 @@
package sitemap

import (
	"compress/gzip"
	"context"
	"encoding/xml"
	"errors"
	"io"
	"net/http"
	"strings"

	"urlcrawler/internal/urlutil"
)

var ErrNotFound = errors.New("sitemap not found")

// FetchAll attempts to fetch /sitemap.xml and /sitemap_index.xml, parse URLs, and follow indexes.
func FetchAll(ctx context.Context, target string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	origin := urlutil.Origin(target)
	candidates := []string{origin + "/sitemap.xml", origin + "/sitemap_index.xml"}
	found := make(map[string]struct{})
	var gotAny bool
	for _, u := range candidates {
		urls, err := fetchOne(ctx, u, client, userAgent)
		if err == nil && len(urls) > 0 {
			gotAny = true
			for v := range urls {
				found[v] = struct{}{}
			}
		}
	}
	if !gotAny {
		return found, ErrNotFound
	}
	return found, nil
}

func fetchOne(ctx context.Context, u string, client *http.Client, userAgent string) (map[string]struct{}, error) {
	req, _ := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New(resp.Status)
	}
	var r io.Reader = resp.Body
	// Transparently decompress gzipped sitemaps (.gz URLs or gzip content type).
	if strings.HasSuffix(strings.ToLower(u), ".gz") || strings.Contains(strings.ToLower(resp.Header.Get("Content-Type")), "gzip") {
		gz, err := gzip.NewReader(resp.Body)
		if err == nil {
			r = gz
			defer gz.Close()
		}
	}
	data, _ := io.ReadAll(r)
	// Always attempt an XML parse regardless of the declared content type:
	// many servers mislabel sitemaps, and the parser rejects non-XML anyway.
	return parseSitemapXML(ctx, client, userAgent, data)
}

func parseSitemapXML(ctx context.Context, client *http.Client, userAgent string, data []byte) (map[string]struct{}, error) {
	type urlEntry struct {
		Loc string `xml:"loc"`
	}
	type urlSet struct {
		URLs []urlEntry `xml:"url"`
	}
	type indexEntry struct {
		Loc string `xml:"loc"`
	}
	type siteIndex struct {
		Sitemaps []indexEntry `xml:"sitemap"`
	}

	found := make(map[string]struct{})

	// First try a <urlset> document.
	var us urlSet
	if err := xml.Unmarshal(data, &us); err == nil && len(us.URLs) > 0 {
		for _, e := range us.URLs {
			loc := strings.TrimSpace(e.Loc)
			if loc != "" {
				found[loc] = struct{}{}
			}
		}
		return found, nil
	}
	// Then try a <sitemapindex> document and fetch each child sitemap.
	var si siteIndex
	if err := xml.Unmarshal(data, &si); err == nil && len(si.Sitemaps) > 0 {
		for _, e := range si.Sitemaps {
			loc := strings.TrimSpace(e.Loc)
			if loc == "" {
				continue
			}
			child, err := fetchOne(ctx, loc, client, userAgent)
			if err == nil {
				for v := range child {
					found[v] = struct{}{}
				}
			}
		}
		return found, nil
	}
	return found, errors.New("unrecognized sitemap XML")
}

@@ -0,0 +1,62 @@
package urlutil

import (
	"net/url"
	"path"
	"strings"
)

// Normalize resolves href against base, strips fragments, and cleans the path.
func Normalize(baseURL string, href string) (string, bool) {
	if href == "" {
		return "", false
	}
	if strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") || strings.HasPrefix(href, "tel:") {
		return "", false
	}

	b, err := url.Parse(baseURL)
	if err != nil {
		return "", false
	}
	u, err := url.Parse(href)
	if err != nil {
		return "", false
	}
	// Resolve relative links.
	u = b.ResolveReference(u)
	// Only http/https.
	if u.Scheme != "http" && u.Scheme != "https" {
		return "", false
	}
	// Drop fragments.
	u.Fragment = ""
	// Clean the path. Note: path.Clean("") returns ".", which is where the
	// trailing "/." variants seen in the reports come from; deduping them is
	// a roadmap item.
	u.Path = path.Clean(u.Path)
	return u.String(), true
}

// SameHost returns true if candidate shares the same host (including port) as baseURL.
func SameHost(baseURL string, candidate string) bool {
	b, err := url.Parse(baseURL)
	if err != nil {
		return false
	}
	u, err := url.Parse(candidate)
	if err != nil {
		return false
	}
	return strings.EqualFold(b.Host, u.Host)
}

// Origin returns scheme://host of a URL.
func Origin(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	u.Path = ""
	u.RawQuery = ""
	u.Fragment = ""
	return u.Scheme + "://" + u.Host
}
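
A small hedged sketch of how these helpers behave (expected outputs follow from the semantics above; the inputs are illustrative):

```go
package main

import (
	"fmt"

	"urlcrawler/internal/urlutil"
)

func main() {
	// Relative resolution plus fragment stripping and path cleaning.
	abs, ok := urlutil.Normalize("https://example.com/a/b", "../c#top")
	fmt.Println(abs, ok) // https://example.com/c true

	// Non-web schemes are rejected.
	_, ok = urlutil.Normalize("https://example.com", "mailto:hi@example.com")
	fmt.Println(ok) // false

	// Host comparison is case-insensitive and includes the port.
	fmt.Println(urlutil.SameHost("https://example.com", "https://EXAMPLE.com/x")) // true

	fmt.Println(urlutil.Origin("https://example.com/a/b?q=1")) // https://example.com
}
```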
@@ -0,0 +1,7 @@
https://www.blackswanstrength.com
https://www.blackswanstrength.com/.
https://www.blackswanstrength.com/booking
https://www.blackswanstrength.com/client-testimonials
https://www.blackswanstrength.com/home
https://www.blackswanstrength.com/like-a-dog-chasing-prs
https://www.blackswanstrength.com/online-coaching

@@ -0,0 +1,8 @@
https://www.blackswanstrength.com
https://www.blackswanstrength.com/
https://www.blackswanstrength.com/.
https://www.blackswanstrength.com/booking
https://www.blackswanstrength.com/client-testimonials
https://www.blackswanstrength.com/home
https://www.blackswanstrength.com/like-a-dog-chasing-prs
https://www.blackswanstrength.com/online-coaching

@@ -0,0 +1,162 @@
package main

import (
	"context"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"net/http"
	"os"
	"strings"
	"sync/atomic"
	"time"

	"urlcrawler/internal/crawler"
	"urlcrawler/internal/linkcheck"
	"urlcrawler/internal/report"
	"urlcrawler/internal/sitemap"
)

func main() {
	var target string
	var concurrency int
	var timeout time.Duration
	var maxDepth int
	var userAgent string
	var sameHostOnly bool
	var output string
	var quiet bool

	flag.StringVar(&target, "target", "", "Target site URL (e.g., https://example.com)")
	flag.IntVar(&concurrency, "concurrency", 10, "Number of concurrent workers")
	flag.DurationVar(&timeout, "timeout", 10*time.Second, "HTTP timeout per request")
	flag.IntVar(&maxDepth, "max-depth", 2, "Maximum crawl depth (0=crawl only the start page)")
	flag.StringVar(&userAgent, "user-agent", "urlcrawler/1.0", "User-Agent header value")
	flag.BoolVar(&sameHostOnly, "same-host-only", true, "Limit crawl to the same host as target")
	flag.StringVar(&output, "output", "text", "Output format: text|json")
	flag.BoolVar(&quiet, "quiet", false, "Suppress progress output")
	flag.Parse()

	if strings.TrimSpace(target) == "" {
		fmt.Fprintln(os.Stderr, "-target is required")
		flag.Usage()
		os.Exit(2)
	}

	client := &http.Client{Timeout: timeout}
	ctx := context.Background()

	fmt.Fprintf(os.Stderr, "Starting crawl of %s (depth: %d)...\n", target, maxDepth)

	// Set up progress counters.
	var urlsVisited, urlsErrored atomic.Int64
	var currentURL atomic.Value // string
	var pendingTasks atomic.Int64

	// Start the progress reporter unless in quiet mode.
	ctxWithCancel, cancel := context.WithCancel(ctx)
	defer cancel()

	if !quiet {
		go func() {
			ticker := time.NewTicker(500 * time.Millisecond)
			defer ticker.Stop()

			for {
				select {
				case <-ticker.C:
					cu, _ := currentURL.Load().(string)
					fmt.Fprintf(os.Stderr, "\rURLs visited: %d | Errors: %d | Pending: %d | Current: %s",
						urlsVisited.Load(), urlsErrored.Load(), pendingTasks.Load(), truncateForTTY(cu, 90))
				case <-ctxWithCancel.Done():
					return
				}
			}
		}()
	}

	// Progress callback functions.
	visitedCallback := func(u string, depth int, pending int) {
		urlsVisited.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}
	errorCallback := func(u string, err error, pending int) {
		urlsErrored.Add(1)
		pendingTasks.Store(int64(pending))
		currentURL.Store(u)
	}

	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)

	// Clear the progress line before moving to the next phase.
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rCrawl complete! URLs visited: %d | Errors: %d\n",
			urlsVisited.Load(), urlsErrored.Load())
	}

	fmt.Fprintf(os.Stderr, "Fetching sitemap...\n")
	smURLs, err := sitemap.FetchAll(ctx, target, client, userAgent)
	if err != nil && !errors.Is(err, sitemap.ErrNotFound) {
		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
	}

	// Build the set of all unique links discovered across pages for status checks.
	allLinks := make(map[string]struct{})
	for _, m := range outlinks {
		for u := range m {
			allLinks[u] = struct{}{}
		}
	}
	// Also include the visited pages themselves.
	for u := range visited {
		allLinks[u] = struct{}{}
	}

	fmt.Fprintf(os.Stderr, "Checking %d links...\n", len(allLinks))

	// Reset counters for link checking.
	urlsVisited.Store(0)
	urlsErrored.Store(0)

	// Progress callback for link checking.
	linkCheckCallback := func(ok bool) {
		if ok {
			urlsVisited.Add(1)
		} else {
			urlsErrored.Add(1)
		}
	}

	checkResults := linkcheck.Check(ctx, allLinks, concurrency, client, userAgent, !quiet, linkCheckCallback)

	// Clear the progress line before finishing.
	if !quiet {
		fmt.Fprintf(os.Stderr, "\rLink checking complete! OK: %d | Errors: %d\n",
			urlsVisited.Load(), urlsErrored.Load())
	}

	fmt.Fprintf(os.Stderr, "Building report...\n")
	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks)

	switch output {
	case "json":
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", "  ")
		_ = enc.Encode(reports)
	default:
		report.PrintText(os.Stdout, reports)
	}
}

// truncateForTTY truncates s to max characters, replacing the tail with … if needed.
func truncateForTTY(s string, max int) string {
	if max <= 0 || len(s) <= max {
		return s
	}
	if max <= 1 {
		return "…"
	}
	return s[:max-1] + "…"
}
File diff suppressed because it is too large

@@ -0,0 +1,59 @@
## URLCrawler Report JSON Schema

This document describes the structure of the JSON reports produced by `urlcrawler` when run with `-output json`.

### Top-level object

```json
{
  "target": "https://example.com",
  "crawledUrls": ["https://example.com", "https://example.com/about"],
  "sitemapUrls": ["https://example.com", "https://example.com/about"],
  "crawlErrors": {"https://bad.example": "error string"},
  "linkStatuses": [
    {"url": "https://example.com", "statusCode": 200, "ok": true},
    {"url": "https://other.example/broken", "statusCode": 404, "ok": false, "error": "..."}
  ],
  "pageOutlinks": {
    "https://example.com": ["https://example.com/about", "https://other.example/"]
  },
  "linkSources": {
    "https://example.com/about": ["https://example.com"]
  },
  "missingInSitemap": ["https://example.com/page-not-in-sitemap"],
  "inSitemapNotCrawled": ["https://example.com/deferred"]
}
```

### Fields

- **target** (string): Normalized start URL used for the crawl.
- **crawledUrls** (string[]): Unique URLs that were visited during crawling. Sorted for stability.
- **sitemapUrls** (string[]; optional): All URLs discovered via `sitemap.xml` (and nested sitemaps). Omitted when no sitemap is found.
- **crawlErrors** (map<string, string>; optional): Maps URL → error message for requests that failed (e.g., network/TLS/timeouts). Only set when errors occurred.
- **linkStatuses** (LinkStatus[]): Result of HTTP status checks for all unique links discovered (including the pages themselves).
  - **url** (string): The checked URL.
  - **statusCode** (number): HTTP status code (0 if the request failed before a response was received).
  - **ok** (boolean): Convenience flag, true when `200 ≤ statusCode < 400` and no error occurred.
  - **error** (string; optional): Error string when a request failed or there was another client error.
- **pageOutlinks** (map<string, string[]>): For each crawled page URL, the list of normalized outgoing links (internal and external).
- **linkSources** (map<string, string[]>): Inverse index: for each discovered link URL, the list of page URLs where it appeared.
- **missingInSitemap** (string[]; optional): URLs that were crawled but not present in the sitemap.
- **inSitemapNotCrawled** (string[]; optional): URLs present in the sitemap that were not crawled (e.g., due to depth limits or off-host rules).

### Notes

- URLs are normalized and deduplicated during crawl.
- Content-type filtering: only `text/html` pages are parsed for outlinks.
- Sitemap fetching is best-effort; absence is not treated as an error.
- The JSON lists are sorted to produce stable outputs across runs.
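
A hedged sketch of consuming the JSON report from Go, reusing the `report.Report` type from this commit; the input filename is illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"urlcrawler/internal/report"
)

func main() {
	// Assumes a report was saved with: urlcrawler -target ... -output json > report.json
	data, err := os.ReadFile("report.json")
	if err != nil {
		panic(err)
	}
	var r report.Report
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	// Print broken links together with the pages that reference them.
	for _, ls := range r.LinkStatuses {
		if !ls.OK {
			fmt.Printf("%s (%d) linked from %v\n", ls.URL, ls.StatusCode, r.LinkSources[ls.URL])
		}
	}
}
```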
@@ -0,0 +1,290 @@
{
  "target": "https://titan-training.ca",
  "crawledUrls": [
    "https://titan-training.ca",
    "https://titan-training.ca/",
    "https://titan-training.ca/.",
    "https://titan-training.ca/cdn-cgi/l/email-protection",
    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
    "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
    "https://titan-training.ca/products-list",
    "https://titan-training.ca/titan-training.ca"
  ],
  "sitemapUrls": [
    "https://titan-training.ca/home",
    "https://titan-training.ca/test_path?item=123"
  ],
  "linkStatuses": [
    {"url": "https://titan-training.ca/products-list", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/", "statusCode": 200, "ok": true},
    {"url": "https://www.cloudflare.com/5xx-error-landing", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca", "statusCode": 200, "ok": true},
    {"url": "https://www.facebook.com/titantrainingkw", "statusCode": 200, "ok": true},
    {"url": "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public", "statusCode": 200, "ok": true},
    {"url": "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation", "statusCode": 200, "ok": true},
    {"url": "https://developers.cloudflare.com/fundamentals/setup/account/create-account", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8", "statusCode": 200, "ok": true},
    {"url": "https://www.cloudflare.com/sign-up?utm_source=email_protection", "statusCode": 403, "ok": false},
    {"url": "https://titan-training.ca/cdn-cgi/l/email-protection", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/titan-training.ca", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/product-details/product/681331db52e2115c63435275", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/.", "statusCode": 200, "ok": true},
    {"url": "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg", "statusCode": 404, "ok": false},
    {"url": "https://www.instagram.com/titan__training", "statusCode": 200, "ok": true},
    {"url": "https://titan-training.ca/product-details/product/titan-training.ca", "statusCode": 200, "ok": true}
  ],
  "pageOutlinks": {
    "https://titan-training.ca": [
      "https://titan-training.ca/.",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/products-list",
      "https://titan-training.ca/titan-training.ca",
      "https://www.facebook.com/titantrainingkw",
      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/": [
      "https://titan-training.ca/",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/products-list",
      "https://titan-training.ca/titan-training.ca",
      "https://www.facebook.com/titantrainingkw",
      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/.": [
      "https://titan-training.ca/",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/products-list",
      "https://titan-training.ca/titan-training.ca",
      "https://www.facebook.com/titantrainingkw",
      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/cdn-cgi/l/email-protection": [
      "https://developers.cloudflare.com/fundamentals/setup/account/create-account",
      "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation",
      "https://www.cloudflare.com/5xx-error-landing",
      "https://www.cloudflare.com/sign-up?utm_source=email_protection"
    ],
    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
      "https://titan-training.ca/",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/titan-training.ca",
      "https://titan-training.ca/products-list",
      "https://www.facebook.com/titantrainingkw",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
      "https://titan-training.ca/",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca/product-details/product/titan-training.ca",
      "https://titan-training.ca/products-list",
      "https://www.facebook.com/titantrainingkw",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/products-list": [
      "https://titan-training.ca/",
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca/products-list",
      "https://titan-training.ca/titan-training.ca",
      "https://www.facebook.com/titantrainingkw",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ],
    "https://titan-training.ca/titan-training.ca": [
      "https://titan-training.ca/cdn-cgi/l/email-protection",
      "https://titan-training.ca/products-list",
      "https://titan-training.ca/titan-training.ca",
      "https://www.facebook.com/titantrainingkw",
      "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg",
      "https://www.instagram.com/titan__training",
      "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public"
    ]
  },
  "linkSources": {
    "https://developers.cloudflare.com/fundamentals/setup/account/create-account": [
      "https://titan-training.ca/cdn-cgi/l/email-protection"
    ],
    "https://developers.cloudflare.com/waf/tools/scrape-shield/email-address-obfuscation": [
      "https://titan-training.ca/cdn-cgi/l/email-protection"
    ],
    "https://titan-training.ca/": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca/products-list"
    ],
    "https://titan-training.ca/.": [
      "https://titan-training.ca"
    ],
    "https://titan-training.ca/cdn-cgi/l/email-protection": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ],
    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8": [
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/products-list"
    ],
    "https://titan-training.ca/product-details/product/681331db52e2115c63435275": [
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca/products-list"
    ],
    "https://titan-training.ca/product-details/product/titan-training.ca": [
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275"
    ],
    "https://titan-training.ca/products-list": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ],
    "https://titan-training.ca/titan-training.ca": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ],
    "https://www.cloudflare.com/5xx-error-landing": [
      "https://titan-training.ca/cdn-cgi/l/email-protection"
    ],
    "https://www.cloudflare.com/sign-up?utm_source=email_protection": [
      "https://titan-training.ca/cdn-cgi/l/email-protection"
    ],
    "https://www.facebook.com/titantrainingkw": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ],
    "https://www.google.com/maps/search?api=1\u0026query=Google\u0026query_place_id=ChIJwzSnW430K4gRU8zOBshqKAg": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca"
    ],
    "https://www.instagram.com/titan__training": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ],
    "https://www.youtube.com/channel/UCOtL1D3s3fBxHJLAyF5kNRA/featured?view_as=public": [
      "https://titan-training.ca/",
      "https://titan-training.ca/.",
      "https://titan-training.ca/titan-training.ca",
      "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
      "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
      "https://titan-training.ca",
      "https://titan-training.ca/products-list"
    ]
  },
  "missingInSitemap": [
    "https://titan-training.ca",
    "https://titan-training.ca/",
    "https://titan-training.ca/.",
    "https://titan-training.ca/cdn-cgi/l/email-protection",
    "https://titan-training.ca/product-details/product/681330e25a7661691fe205c8",
    "https://titan-training.ca/product-details/product/681331db52e2115c63435275",
    "https://titan-training.ca/products-list",
    "https://titan-training.ca/titan-training.ca"
  ],
  "inSitemapNotCrawled": [
    "https://titan-training.ca/home",
    "https://titan-training.ca/test_path?item=123"
  ]
}
Binary file not shown.