Skip to content

Commit 4ab5111

Browse files
committed
mv nodes opt func to htmldoc
1 parent 1c72c21 commit 4ab5111

2 files changed

Lines changed: 45 additions & 46 deletions

File tree

internal/fetcher/links.go

Lines changed: 1 addition & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2,62 +2,17 @@
22
package fetcher
33

44
import (
5-
"fmt"
65
"log"
7-
"net/http"
86
"net/url"
97
"regexp"
108
"strings"
119

1210
"github.com/wedojava/fetcher/internal/htmldoc"
1311
"github.com/wedojava/gears"
14-
"golang.org/x/net/html"
1512
)
1613

17-
// ExtractLinks makes an HTTP GET request to the specified URL, parses
18-
// the response as HTML, and returns the links in the HTML document.
19-
func ExtractLinks(str string) ([]string, error) {
20-
resp, err := http.Get(str)
21-
if err != nil {
22-
return nil, err
23-
}
24-
if resp.StatusCode != http.StatusOK {
25-
resp.Body.Close()
26-
return nil, fmt.Errorf("getting %s: %s", str, resp.Status)
27-
}
28-
doc, err := html.Parse(resp.Body)
29-
resp.Body.Close()
30-
if err != nil {
31-
return nil, fmt.Errorf("parsing %s as HTML: %v", str, err)
32-
}
33-
var links []string
34-
visitNode := func(n *html.Node) {
35-
// TODO: compress layers
36-
if n.Type == html.ElementNode && n.Data == "a" {
37-
for _, a := range n.Attr {
38-
if a.Key != "href" {
39-
continue
40-
}
41-
link, err := resp.Request.URL.Parse(a.Val)
42-
if err != nil {
43-
continue // ignore bad URLs
44-
}
45-
// append only the target website
46-
if strings.HasPrefix(a.Val, "http") && strings.Contains(a.Val, link.Hostname()) {
47-
links = append(links, link.String())
48-
} else if strings.HasPrefix(a.Val, "/") {
49-
links = append(links, link.String())
50-
}
51-
52-
}
53-
}
54-
}
55-
htmldoc.ForEachNode(doc, visitNode, nil)
56-
return links, nil
57-
}
58-
5914
func (f *Fetcher) SetLinks() error {
60-
links, err := ExtractLinks(f.Entrance.String())
15+
links, err := htmldoc.ExtractLinks(f.Entrance.String())
6116
if err != nil {
6217
log.Printf(`can't extract links from "%s": %s`, f.Entrance.String(), err)
6318
return err

internal/htmldoc/htmldoc.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package htmldoc
22

33
import (
44
"bytes"
5+
"fmt"
56
"io/ioutil"
67
"log"
78
"net/http"
89
"net/url"
10+
"strings"
911
"time"
1012

1113
"golang.org/x/net/html"
@@ -35,6 +37,48 @@ func GetRawAndDoc(url *url.URL, retryTimeout time.Duration) ([]byte, *html.Node,
3537
return nil, nil, nil
3638
}
3739

40+
// ExtractLinks makes an HTTP GET request to the specified URL, parses
41+
// the response as HTML, and returns the links in the HTML document.
42+
func ExtractLinks(weburl string) ([]string, error) {
43+
resp, err := http.Get(weburl)
44+
if err != nil {
45+
return nil, err
46+
}
47+
if resp.StatusCode != http.StatusOK {
48+
resp.Body.Close()
49+
return nil, fmt.Errorf("getting %s: %s", weburl, resp.Status)
50+
}
51+
doc, err := html.Parse(resp.Body)
52+
resp.Body.Close()
53+
if err != nil {
54+
return nil, fmt.Errorf("parsing %s as HTML: %v", weburl, err)
55+
}
56+
var links []string
57+
visitNode := func(n *html.Node) {
58+
// TODO: compress layers
59+
if n.Type == html.ElementNode && n.Data == "a" {
60+
for _, a := range n.Attr {
61+
if a.Key != "href" {
62+
continue
63+
}
64+
link, err := resp.Request.URL.Parse(a.Val)
65+
if err != nil {
66+
continue // ignore bad URLs
67+
}
68+
// append only the target website
69+
if strings.HasPrefix(a.Val, "http") && strings.Contains(a.Val, link.Hostname()) {
70+
links = append(links, link.String())
71+
} else if strings.HasPrefix(a.Val, "/") {
72+
links = append(links, link.String())
73+
}
74+
75+
}
76+
}
77+
}
78+
ForEachNode(doc, visitNode, nil)
79+
return links, nil
80+
}
81+
3882
func ElementsByTagName(doc *html.Node, name ...string) []*html.Node {
3983
var nodes []*html.Node
4084
if len(name) == 0 {

0 commit comments

Comments
 (0)