|
2 | 2 | package fetcher |
3 | 3 |
|
4 | 4 | import ( |
5 | | - "fmt" |
6 | 5 | "log" |
7 | | - "net/http" |
8 | 6 | "net/url" |
9 | 7 | "regexp" |
10 | 8 | "strings" |
11 | 9 |
|
12 | 10 | "github.com/wedojava/fetcher/internal/htmldoc" |
13 | 11 | "github.com/wedojava/gears" |
14 | | - "golang.org/x/net/html" |
15 | 12 | ) |
16 | 13 |
|
17 | | -// ExtractLinks makes an HTTP GET request to the specified URL, parses |
18 | | -// the response as HTML, and returns the links in the HTML document. |
19 | | -func ExtractLinks(str string) ([]string, error) { |
20 | | - resp, err := http.Get(str) |
21 | | - if err != nil { |
22 | | - return nil, err |
23 | | - } |
24 | | - if resp.StatusCode != http.StatusOK { |
25 | | - resp.Body.Close() |
26 | | - return nil, fmt.Errorf("getting %s: %s", str, resp.Status) |
27 | | - } |
28 | | - doc, err := html.Parse(resp.Body) |
29 | | - resp.Body.Close() |
30 | | - if err != nil { |
31 | | - return nil, fmt.Errorf("parsing %s as HTML: %v", str, err) |
32 | | - } |
33 | | - var links []string |
34 | | - visitNode := func(n *html.Node) { |
35 | | - // TODO: compress layers |
36 | | - if n.Type == html.ElementNode && n.Data == "a" { |
37 | | - for _, a := range n.Attr { |
38 | | - if a.Key != "href" { |
39 | | - continue |
40 | | - } |
41 | | - link, err := resp.Request.URL.Parse(a.Val) |
42 | | - if err != nil { |
43 | | - continue // ignore bad URLs |
44 | | - } |
45 | | - // append only the target website |
46 | | - if strings.HasPrefix(a.Val, "http") && strings.Contains(a.Val, link.Hostname()) { |
47 | | - links = append(links, link.String()) |
48 | | - } else if strings.HasPrefix(a.Val, "/") { |
49 | | - links = append(links, link.String()) |
50 | | - } |
51 | | - |
52 | | - } |
53 | | - } |
54 | | - } |
55 | | - htmldoc.ForEachNode(doc, visitNode, nil) |
56 | | - return links, nil |
57 | | -} |
58 | | - |
59 | 14 | func (f *Fetcher) SetLinks() error { |
60 | | - links, err := ExtractLinks(f.Entrance.String()) |
| 15 | + links, err := htmldoc.ExtractLinks(f.Entrance.String()) |
61 | 16 | if err != nil { |
62 | 17 | log.Printf(`can't extract links from "%s": %s`, f.Entrance.String(), err) |
63 | 18 | return err |
|
0 commit comments