gosint-sitecrawl/internal/linkcheck/linkcheck.go

package linkcheck

import (
	"context"
	"net/http"
	"sync"
)
// LinkStatus records the outcome of checking a single URL.
type LinkStatus struct {
	URL        string `json:"url"`
	StatusCode int    `json:"statusCode"`
	OK         bool   `json:"ok"`
	Err        string `json:"error,omitempty"`
}

// Results collects the statuses of all checked URLs.
type Results struct {
	Statuses []LinkStatus `json:"statuses"`
}
// Check probes every URL in urls using up to concurrency parallel workers.
// Each worker issues a HEAD request and falls back to GET when needed (see
// headOrGet). If progressCallback is non-nil it is invoked once per checked
// URL, from the worker goroutine, with that URL's OK result.
func Check(ctx context.Context, urls map[string]struct{}, concurrency int, client *http.Client, userAgent string, showProgress bool, progressCallback func(bool)) Results {
	var mu sync.Mutex
	var statuses []LinkStatus

	type job struct{ u string }
	jobs := make(chan job, concurrency*2)

	var wg sync.WaitGroup
	worker := func() {
		defer wg.Done()
		for j := range jobs {
			status, err := headOrGet(ctx, client, userAgent, j.u)
			ls := LinkStatus{URL: j.u, StatusCode: status}
			if err != nil {
				ls.Err = err.Error()
			}
			ls.OK = err == nil && status >= 200 && status < 400
			mu.Lock()
			statuses = append(statuses, ls)
			mu.Unlock()
			if progressCallback != nil {
				progressCallback(ls.OK)
			}
		}
	}

	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go worker()
	}
	for u := range urls {
		jobs <- job{u: u}
	}
	close(jobs)
	wg.Wait()

	return Results{Statuses: statuses}
}
// headOrGet issues a HEAD request first and falls back to GET when the server
// answers with 405 Method Not Allowed or a 5xx status, since some servers do
// not implement HEAD correctly. It returns the final status code, or 0 and an
// error if the request could not be built or performed.
func headOrGet(ctx context.Context, client *http.Client, userAgent string, u string) (int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, u, nil)
	if err != nil {
		return 0, err
	}
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err == nil {
		defer resp.Body.Close()
		if resp.StatusCode >= 200 && resp.StatusCode < 400 {
			return resp.StatusCode, nil
		}
		// Treat other 4xx responses (except 405) as definitive; only retry
		// with GET on 405 or 5xx, where HEAD support is often broken.
		if resp.StatusCode != http.StatusMethodNotAllowed && resp.StatusCode < 500 {
			return resp.StatusCode, nil
		}
	}

	// Fallback GET.
	req2, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return 0, err
	}
	req2.Header.Set("User-Agent", userAgent)
	resp2, err2 := client.Do(req2)
	if err2 != nil {
		return 0, err2
	}
	defer resp2.Body.Close()
	return resp2.StatusCode, nil
}
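
A minimal caller sketch for linkcheck.Check. This is not part of the package: it assumes a hypothetical main package inside the same module (the import path `gosint-sitecrawl/internal/linkcheck` is a guess at the module path), and the URLs and user-agent string are placeholders.

package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"gosint-sitecrawl/internal/linkcheck" // assumed module path; internal packages are only importable within the module
)

func main() {
	// Set of URLs to probe; values are empty structs, as Check expects.
	urls := map[string]struct{}{
		"https://example.com/":        {},
		"https://example.com/missing": {},
	}

	client := &http.Client{Timeout: 10 * time.Second}
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// Check 4 URLs at a time; the callback runs in worker goroutines and
	// prints a dot per checked URL as a crude progress indicator.
	results := linkcheck.Check(ctx, urls, 4, client, "gosint-sitecrawl/0.1", false, func(ok bool) {
		fmt.Print(".")
	})
	fmt.Println()

	for _, s := range results.Statuses {
		fmt.Printf("%-40s %d ok=%v %s\n", s.URL, s.StatusCode, s.OK, s.Err)
	}
}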