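// Package report assembles crawl, sitemap, and link-check results into a
// single Report value that can be serialized to JSON or printed as text.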
package report

import (
    "fmt"
    "io"
    "net/url"
    "sort"
    "strings"

    "urlcrawler/internal/linkcheck"
)

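// Report is the top-level result document for a single crawl run; the JSON
// struct tags define its serialized form.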
type Report struct {
    Target              string                 `json:"target"`
    CrawledURLs         []string               `json:"crawledUrls"`
    SitemapURLs         []string               `json:"sitemapUrls,omitempty"`
    CrawlErrors         map[string]string      `json:"crawlErrors,omitempty"`
    LinkStatuses        []linkcheck.LinkStatus `json:"linkStatuses"`
    PageOutlinks        map[string][]string    `json:"pageOutlinks"`
    LinkSources         map[string][]string    `json:"linkSources"`
    MissingInSitemap    []string               `json:"missingInSitemap,omitempty"`
    InSitemapNotCrawled []string               `json:"inSitemapNotCrawled,omitempty"`
    Metadata            Metadata               `json:"metadata"`
    Params              Params                 `json:"params"`
    Stats               Stats                  `json:"stats"`
    ReportSummary       string                 `json:"reportSummary,omitempty"`
    TopExternalDomains  []DomainCount          `json:"topExternalDomains,omitempty"`
    BrokenSample        []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
    BrokenByDomain      []DomainCount          `json:"brokenByDomain,omitempty"`
    Pages               map[string]PageMeta    `json:"pages"`
    DepthDistribution   map[int]int            `json:"depthDistribution"`
    Robots              RobotsSummary          `json:"robots"`
}

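// Metadata records the timing of a run. Timestamps are RFC3339 strings.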
type Metadata struct {
    StartedAt  string `json:"startedAt"`  // RFC3339
    FinishedAt string `json:"finishedAt"` // RFC3339
    DurationMs int64  `json:"durationMs"`
}

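// Params captures the crawl configuration that produced the report.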
type Params struct {
    MaxDepth     int    `json:"maxDepth"`
    Concurrency  int    `json:"concurrency"`
    TimeoutMs    int64  `json:"timeoutMs"`
    UserAgent    string `json:"userAgent"`
    SameHostOnly bool   `json:"sameHostOnly"`
}

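// Stats is a histogram of link-check outcomes, split into OK/broken totals
// and HTTP status classes.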
type Stats struct {
    OK          int `json:"ok"`
    Broken      int `json:"broken"`
    Status2xx   int `json:"status2xx"`
    Status3xx   int `json:"status3xx"`
    Status4xx   int `json:"status4xx"`
    Status5xx   int `json:"status5xx"`
    StatusOther int `json:"statusOther"`
}

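// DomainCount pairs a domain with an occurrence count; it backs the
// top-external-domains and broken-by-domain summaries.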
type DomainCount struct {
    Domain string `json:"domain"`
    Count  int    `json:"count"`
}

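// PageMeta holds per-page details collected while crawling.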
type PageMeta struct {
    Title          string `json:"title"`
    ResponseTimeMs int64  `json:"responseTimeMs"`
    ContentLength  int    `json:"contentLength"`
    Depth          int    `json:"depth"`
}

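// RobotsSummary records whether a robots.txt was found and when it was fetched.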
type RobotsSummary struct {
    Present   bool   `json:"present"`
    FetchedAt string `json:"fetchedAt,omitempty"`
}

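// Build assembles a Report from the raw crawl artifacts: the sets of crawled
// and sitemap URLs, per-URL crawl errors, link-check results, and per-page
// outlinks. It derives the sitemap diff lists, the status histogram, the
// domain summaries, and the depth distribution. A minimal call, assuming the
// caller has populated these maps during the crawl, might look like:
//
//    rep := report.Build(target, crawled, sitemap, crawlErrs, results,
//        outlinks, meta, params, pages, robots)
//    report.PrintText(os.Stdout, rep)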
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
    crawledList := sanitizeURLs(keys(crawled))
    sitemapList := sanitizeURLs(keys(sitemap))
    crawlErrMap := make(map[string]string, len(crawlErrs))
    for k, v := range crawlErrs {
        crawlErrMap[k] = v.Error()
    }

    missing := difference(crawled, sitemap)
    missingList := sanitizeURLs(keys(missing))
    inSmNotCrawled := difference(sitemap, crawled)
    inSmNotCrawledList := sanitizeURLs(keys(inSmNotCrawled))

    pageOut := make(map[string][]string, len(outlinks))
    linkSrc := make(map[string][]string)
    for page, set := range outlinks {
        lst := sanitizeURLs(keys(set))
        pageOut[page] = lst
        for _, u := range lst {
            linkSrc[u] = append(linkSrc[u], page)
        }
    }

    // Compute simple status histogram
    var st Stats
    for _, ls := range check.Statuses {
        if ls.OK {
            st.OK++
        } else {
            st.Broken++
        }
        switch {
        case ls.StatusCode >= 200 && ls.StatusCode < 300:
            st.Status2xx++
        case ls.StatusCode >= 300 && ls.StatusCode < 400:
            st.Status3xx++
        case ls.StatusCode >= 400 && ls.StatusCode < 500:
            st.Status4xx++
        case ls.StatusCode >= 500 && ls.StatusCode < 600:
            st.Status5xx++
        default:
            st.StatusOther++
        }
    }

    // Derived summaries
    tHost := hostLower(target)
    extCounts := map[string]int{}
    brokenByDomain := map[string]int{}
    var brokenSample []linkcheck.LinkStatus
    for _, ls := range check.Statuses {
        h := hostLower(ls.URL)
        if h != "" && !strings.EqualFold(h, tHost) {
            extCounts[h]++
        }
        if !ls.OK {
            brokenByDomain[h]++
            if len(brokenSample) < 10 {
                brokenSample = append(brokenSample, ls)
            }
        }
    }
    topExternal := mapToSortedSlice(extCounts)
    brokenBy := mapToSortedSlice(brokenByDomain)

    // Depth distribution
    depthDist := make(map[int]int)
    for _, pm := range pages {
        depthDist[pm.Depth]++
    }

    summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
        len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)

    return Report{
        Target:              target,
        CrawledURLs:         crawledList,
        SitemapURLs:         sitemapList,
        CrawlErrors:         crawlErrMap,
        LinkStatuses:        check.Statuses,
        PageOutlinks:        pageOut,
        LinkSources:         linkSrc,
        MissingInSitemap:    missingList,
        InSitemapNotCrawled: inSmNotCrawledList,
        Metadata:            meta,
        Params:              params,
        Stats:               st,
        ReportSummary:       summary,
        TopExternalDomains:  topExternal,
        BrokenSample:        brokenSample,
        BrokenByDomain:      brokenBy,
        Pages:               pages,
        DepthDistribution:   depthDist,
        Robots:              robots,
    }
}

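// PrintText writes a short plain-text summary of r to w; the full detail set
// is only available in the JSON form. The output has this shape (values
// illustrative):
//
//    Target: https://example.com
//
//    Crawled URLs: 12
//    Sitemap URLs: 10
//    Links checked: 87
//    Missing in sitemap: 3
//    In sitemap not crawled: 1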
func PrintText(w io.Writer, r Report) {
    fmt.Fprintf(w, "Target: %s\n\n", r.Target)
    fmt.Fprintf(w, "Crawled URLs: %d\n", len(r.CrawledURLs))
    fmt.Fprintf(w, "Sitemap URLs: %d\n", len(r.SitemapURLs))
    fmt.Fprintf(w, "Links checked: %d\n", len(r.LinkStatuses))
    fmt.Fprintf(w, "Missing in sitemap: %d\n", len(r.MissingInSitemap))
    fmt.Fprintf(w, "In sitemap not crawled: %d\n\n", len(r.InSitemapNotCrawled))

    // Keep text output concise; details available in JSON
}

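// keys returns the keys of a set-style map, sorted so that report output is
// deterministic across runs.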
func keys[T comparable](m map[T]struct{}) []T {
    res := make([]T, 0, len(m))
    for k := range m {
        res = append(res, k)
    }
    sort.Slice(res, func(i, j int) bool { return asString(res[i]) < asString(res[j]) })
    return res
}

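// asString renders a sort key for a map key of any comparable type; the
// string case avoids an fmt.Sprintf round-trip for the common case.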
func asString[T any](v T) string {
    switch x := any(v).(type) {
    case string:
        return x
    default:
        return fmt.Sprintf("%v", v)
    }
}

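// difference returns the elements of a that are not present in b.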
func difference(a, b map[string]struct{}) map[string]struct{} {
    res := make(map[string]struct{})
    for k := range a {
        if _, ok := b[k]; !ok {
            res[k] = struct{}{}
        }
    }
    return res
}

// sanitizeURLs normalizes small variants like trailing "/." to "/" for
// consistency, and returns the result sorted.
func sanitizeURLs(urls []string) []string {
    out := make([]string, 0, len(urls))
    for _, u := range urls {
        out = append(out, sanitizeURL(u))
    }
    sort.Strings(out)
    return out
}

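// sanitizeURL rewrites a URL ending in "/." so that it ends in "/".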
func sanitizeURL(u string) string {
    if len(u) >= 2 && u[len(u)-2:] == "/." {
        return u[:len(u)-1]
    }
    return u
}

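// hostLower returns the lowercased host portion of raw (including any port),
// or "" when raw does not parse or has no host.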
func hostLower(raw string) string {
    u, err := url.Parse(raw)
    if err != nil {
        return ""
    }
    return strings.ToLower(u.Host)
}

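// mapToSortedSlice converts a domain-to-count map into a slice sorted by
// count (descending, ties broken by domain name) and truncated to the top 10.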
func mapToSortedSlice(m map[string]int) []DomainCount {
    if len(m) == 0 {
        return nil
    }
    out := make([]DomainCount, 0, len(m))
    for k, v := range m {
        out = append(out, DomainCount{Domain: k, Count: v})
    }
    sort.Slice(out, func(i, j int) bool {
        if out[i].Count == out[j].Count {
            return out[i].Domain < out[j].Domain
        }
        return out[i].Count > out[j].Count
    })
    if len(out) > 10 {
        out = out[:10]
    }
    return out
}