// 获取网页内容 — fetch a page with net/http and print the response body.
package main

import (
	"fmt"
	"io"
	"net/http"
)

// main sends a GET request to httpbin and writes the raw response body to
// stdout. Any request or read failure is reported and ends the program.
func main() {
	const target = "https://httpbin.org/get"

	resp, err := http.Get(target)
	if err != nil {
		fmt.Println("请求失败:", err)
		return
	}
	// The body must be closed so the underlying connection can be released.
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取内容失败:", err)
		return
	}

	fmt.Println("网页内容:")
	fmt.Println(string(raw))
}
解析html 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 package mainimport ( "fmt" "net/http" "golang.org/x/net/html" ) func main () { resp, _ := http.Get("https://golang.org" ) defer resp.Body.Close() doc, err := html.Parse(resp.Body) if err != nil { fmt.Println("解析失败:" , err) return } var links []string var f func (*html.Node) f = func (n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, attr := range n.Attr { if attr.Key == "href" { links = append (links, attr.Val) } } } for c := n.FirstChild; c != nil ; c = c.NextSibling { f(c) } } f(doc) fmt.Println("页面所有链接:" ) for _, link := range links { fmt.Println(link) } }
doc, err := html.Parse(resp.Body)的作用? 解析响应体中的HTML内容,返回一个HTML节点树的根节点doc。把网页的原始 HTML 代码,变成 Go 能看懂、能遍历、能查找的 DOM 树结构 。
// 原生解析Json — decode JSON into a typed struct with encoding/json.
package main

import (
	"encoding/json"
	"fmt"
)

// User mirrors the JSON object; the struct tags map the lower-case JSON keys
// onto the exported Go fields.
type User struct {
	Name  string `json:"name"`
	Age   int    `json:"age"`
	Email string `json:"email"`
}

// main decodes a fixed JSON document into a User and prints each field.
func main() {
	const jsonStr = `{"name":"张三","age":20,"email":"zhangsan@example.com"}`

	var u User
	if err := json.Unmarshal([]byte(jsonStr), &u); err != nil {
		fmt.Println("解析失败:", err)
		return
	}

	fmt.Println("姓名:", u.Name)
	fmt.Println("年龄:", u.Age)
	fmt.Println("邮箱:", u.Email)
}
// 不确定json里有啥 — decode JSON of unknown shape into map[string]interface{}.
package main

import (
	"encoding/json"
	"fmt"
)

// main decodes a JSON object into a generic map and reads two fields from it.
// encoding/json stores every JSON number as float64 in an interface{} value,
// which is why "age" is asserted to float64 rather than int.
func main() {
	jsonStr := `{"name":"李四","age":25,"email":"lisi@example.com"}`

	var data map[string]interface{}
	if err := json.Unmarshal([]byte(jsonStr), &data); err != nil {
		fmt.Println("解析失败:", err)
		return
	}

	// Use the comma-ok form: a bare assertion panics when the key is missing
	// or holds another type, which is exactly the case with unknown JSON.
	name, ok := data["name"].(string)
	if !ok {
		fmt.Println("name 字段缺失或类型不是 string")
		return
	}
	fmt.Println("name:", name)

	age, ok := data["age"].(float64)
	if !ok {
		fmt.Println("age 字段缺失或类型不是 number")
		return
	}
	fmt.Println("age:", age)
}
// 带上jwt — send a request carrying a JWT in the Authorization header.
package main

import (
	"fmt"
	"io"
	"net/http"
)

// main builds a GET request, attaches a bearer token and prints the response
// body. Every step that can fail is checked; previously a failed request left
// resp nil and the deferred resp.Body.Close() panicked.
func main() {
	url := "https://httpbin.org/get"

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		fmt.Println("创建请求失败:", err)
		return
	}

	token := "你的token字符串"
	// Set (not Add) so the request carries exactly one Authorization value.
	req.Header.Set("Authorization", "Bearer "+token)

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("请求失败:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取内容失败:", err)
		return
	}

	// The builtin println is kept from the original; it writes to stderr.
	println(string(body))
}
1 req.Header.Add("Cookie", "session=abc123; uid=10001")
2 req.AddCookie(&http.Cookie{ Name: "session", Value: "abc123", })
// Fragment: build one request and attach the usual crawler headers.
// `url` is supplied by the surrounding code; the NewRequest error is ignored
// here for brevity only.
req, _ := http.NewRequest("GET", url, nil)

// Bearer token for authenticated endpoints.
req.Header.Add("Authorization", "Bearer your-token-here")

// AddCookie appends the cookie to the request's Cookie header.
req.AddCookie(&http.Cookie{Name: "session", Value: "xxx"})

// Browser-like identification headers.
req.Header.Add("User-Agent", "Mozilla/5.0")
req.Header.Add("Referer", "https://google.com")
// 令牌桶 限流 — bounded-concurrency crawler.
//
// NOTE: despite the section title, the buffered channel below is a counting
// semaphore: it caps how many requests are in flight at once, it does not
// limit requests per second. A real token bucket is golang.org/x/time/rate.
package main

import (
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"sync"
	"time"
)

// maxConcurrency is the maximum number of crawl goroutines running at once.
const maxConcurrency = 5

// crawlClient is shared by every crawl call instead of building a new client
// per request.
var crawlClient = &http.Client{Timeout: 10 * time.Second}

// main crawls a fixed URL list with at most maxConcurrency requests in
// flight, adding a random 500–1999 ms delay before each request.
func main() {
	urls := []string{
		"https://www.baidu.com",
		"https://www.qq.com",
		"https://www.163.com",
		"https://www.sina.com",
		"https://www.zhihu.com",
		"https://www.jd.com",
		"https://www.taobao.com",
	}

	tokenCh := make(chan struct{}, maxConcurrency)
	var wg sync.WaitGroup

	for _, u := range urls {
		// Acquire a slot before spawning; blocks while maxConcurrency
		// goroutines are already running.
		tokenCh <- struct{}{}
		wg.Add(1)

		go func(url string) {
			defer wg.Done()
			defer func() { <-tokenCh }() // release the slot

			delay := time.Millisecond * time.Duration(rand.Intn(1500)+500)
			time.Sleep(delay)

			fmt.Printf("开始爬取: %s,延时: %v\n", url, delay)

			if err := crawl(url); err != nil {
				fmt.Printf("爬取失败: %s, err: %v\n", url, err)
			} else {
				fmt.Printf("爬取成功: %s\n", url)
			}
		}(u)
	}

	// Wait for every goroutine explicitly rather than refilling the channel.
	wg.Wait()
	fmt.Println("所有任务完成!")
}

// crawl performs a GET on url with a browser-like User-Agent and drains the
// body. It returns the first error from building the request, sending it, or
// reading the response.
func crawl(url string) error {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		// A malformed URL would otherwise leave req nil and panic below.
		return err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

	resp, err := crawlClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// The content is not used, so discard it instead of buffering it all.
	_, err = io.Copy(io.Discard, resp.Body)
	return err
}
官方实现 golang.org/x/time/rate
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 package mainimport ( "context" "fmt" "io" "net/http" "sync" "time" "golang.org/x/time/rate" ) func main () { urls := []string { "https://www.baidu.com" , "https://www.qq.com" , "https://www.163.com" , } limiter := rate.NewLimiter(2 , 3 ) var wg sync.WaitGroup for _, u := range urls { wg.Add(1 ) go func (url string ) { defer wg.Done() err := limiter.Wait(context.Background()) if err != nil { fmt.Println("限流错误:" , err) return } crawl(url) fmt.Println("完成:" , url) }(u) } wg.Wait() fmt.Println("所有任务完成" ) } func crawl (url string ) error { client := http.Client{Timeout: 5 * time.Second} req, _ := http.NewRequest("GET" , url, nil ) req.Header.Add("User-Agent" , "Mozilla/5.0" ) resp, err := client.Do(req) if err != nil { return err } defer resp.Body.Close() _, err = io.ReadAll(resp.Body) return err }
等令牌不会造成负担吗? Go 的 goroutine(协程)被阻塞等待时,不占 OS 线程、不占 CPU、几乎不占内存!
UA池 每个请求随机换一个浏览器标识,不让网站发现你是固定 UA 的爬虫。
// uaList is the pool of User-Agent strings to rotate through.
// The "..." inside the entries is an abbreviation kept from the original.
var uaList = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ... Chrome/120.0.0.0",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ... Safari/537.36",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15",
}

// randomUA returns one entry of uaList chosen uniformly at random.
// rand.Intn panics when its argument is 0, so uaList must not be empty.
func randomUA() string {
	return uaList[rand.Intn(len(uaList))]
}

// Usage (belongs inside the code that builds the request):
req.Header.Set("User-Agent", randomUA())
代理IP池 每个请求随机换一个代理 IP,不让网站发现你是固定 IP 的爬虫。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 var proxyList = []string { "http://192.168.1.100:8888" , "http://192.168.1.101:8888" , "http://192.168.1.102:8888" , } func randomProxy () string { return proxyList[rand.Intn(len (proxyList))] } func newClientWithProxy () *http.Client { proxyURL, _ := url.Parse(randomProxy()) transport := &http.Transport{ Proxy: http.ProxyURL(proxyURL), } return &http.Client{ Transport: transport, Timeout: 10 * time.Second, } }
蘑菇代理、阿布云、快代理
Referer 链路模拟 爬首页 → Referer 为空 或 填搜索引擎 爬列表页 → Referer 填首页 爬详情页 → Referer 填列表页
1 2 3 4 5 6 7 8 9 访问首页: https://www.xxx.com → Referer: "" 访问列表页: https://www.xxx.com/list → Referer: "https://www.xxx.com" 访问详情页: https://www.xxx.com/detail/1 → Referer: "https://www.xxx.com/list"
// Fragment: two alternative Referer values, one per request, not both on
// the same request. Set replaces any existing value, so running both lines
// on one req keeps only the second.

// Use on the list-page request: its referrer is the home page.
req.Header.Set("Referer", "https://www.xxx.com")

// Use on the detail-page request: its referrer is the list page.
req.Header.Set("Referer", "https://www.xxx.com/list")
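不要每次请求都新建 http.Transport(或带自定义 Transport 的 http.Client):连接池属于 Transport,每新建一个 Transport 就多一个独立的连接池,旧连接无法复用,每次都要重新建立 TCP/TLS 连接,又重又慢。注意:未设置 Transport 字段的 http.Client 会共用 http.DefaultTransport,连接池仍是共享的;但统一复用同一个 Client 依然是推荐做法。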
// 正确例子 — the recommended shape: one package-level client shared by all
// requests.
// Global singleton.
var globalClient = &http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		// Connection-pool settings go here…
	},
}
package main

import (
	"net"
	"net/http"
	"sync"
	"time"
)

var (
	// spiderClient is the process-wide client, created on first use.
	spiderClient *http.Client
	// once guards the one-time initialisation of spiderClient.
	once sync.Once
)

// GetSpiderClient returns the shared crawler HTTP client, building it on the
// first call. It is safe to call from multiple goroutines; sync.Once ensures
// the client is constructed exactly once.
func GetSpiderClient() *http.Client {
	once.Do(func() {
		spiderClient = &http.Client{
			// Upper bound for the whole request, including reading the body.
			Timeout: 15 * time.Second,
			Transport: &http.Transport{
				// http.Transport has no DialTimeout field; the connect
				// timeout is configured through a net.Dialer.
				DialContext: (&net.Dialer{
					Timeout: 5 * time.Second,
				}).DialContext,

				MaxIdleConns:        100,              // idle connections kept across all hosts
				MaxIdleConnsPerHost: 20,               // idle connections kept per host
				IdleConnTimeout:     30 * time.Second, // how long an idle connection is kept

				// Time allowed to wait for response headers after the
				// request has been written.
				ResponseHeaderTimeout: 5 * time.Second,

				ForceAttemptHTTP2: false,
			},
		}
	})
	return spiderClient
}
cookieJar :go原生的自动cookie管理 (我去,之前我还真不知道这个)
package main

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
)

// main logs in with a form POST and then requests a protected page using the
// same client. The client's cookie jar stores cookies set by the login
// response and attaches them to the follow-up request.
func main() {
	jar, err := cookiejar.New(nil)
	if err != nil {
		fmt.Println("创建 CookieJar 失败:", err)
		return
	}

	client := &http.Client{
		Jar: jar,
	}

	loginUrl := "https://example.com/login"
	data := url.Values{"username": {"user"}, "password": {"123"}}

	loginResp, err := client.PostForm(loginUrl, data)
	if err != nil {
		fmt.Println("登录请求失败:", err)
		return
	}
	// The login body is not needed, but it must be closed or the connection
	// leaks.
	loginResp.Body.Close()

	resp, err := client.Get("https://example.com/profile")
	if err != nil {
		// resp is nil when err is non-nil, so touching resp.Body would panic.
		fmt.Println("请求失败:", err)
		return
	}
	defer resp.Body.Close()

	fmt.Println("请求成功,已自动携带登录 Cookie")
}
Transport应该全局单例 你问到了爬虫/接口请求最核心的痛点 !
我直接给你Go 里处理 JS 渲染、异步加载页面的所有实战方案 ,从简单到强大,你直接复制就能用 。
遇到 JS 渲染、异步加载的页面,直接用 http.Client 拿不到数据 方案 1:抓真实异步接口(最推荐、最快、最稳) 99% 的异步加载页面,数据都是通过 XHR / Fetch 接口加载的。
怎么做:
打开网页 F12 → Network → XHR / Fetch
找到真实数据接口
直接用 http.Client 请求这个接口
优点:
缺点:
方案 2:用 chromedp(控制 Chrome 无头浏览器)⭐⭐⭐⭐⭐ Go 最主流、最强大、企业级方案。 相当于用代码控制一个真正的 Chrome 浏览器。
安装 1 go get github.com/chromedp/chromedp
示例代码(直接运行) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 package mainimport ( "context" "log" "time" "github.com/chromedp/chromedp" ) func main () { ctx, cancel := chromedp.NewContext( context.Background(), chromedp.WithLogf(log.Printf), ) defer cancel() ctx, cancel = context.WithTimeout(ctx, 15 *time.Second) defer cancel() var html string err := chromedp.Run(ctx, chromedp.Navigate(`https://your-page.com` ), chromedp.WaitVisible(`#content` ), chromedp.OuterHTML(`html` , &html), ) if err != nil { log.Fatal(err) } log.Println(html) }
优点:
完全模拟浏览器
能处理登录、滑动、点击、JS 加密
稳定、生态成熟
缺点:
方案 3:用 go-rod(比 chromedp 更简单)⭐⭐⭐⭐ Go 界最简单好用 的浏览器控制库,语法超直观。
安装 1 go get github.com/go-rod/rod
示例代码(极简) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 package mainimport ( "fmt" "github.com/go-rod/rod" ) func main () { page := rod.New().MustConnect().MustPage("https://your-page.com" ) page.MustWait(`#content` ) html := page.MustHTML() fmt.Println(html) }
优点:
方案 4:用 playwright-go(社区维护的 Go 绑定)⭐⭐⭐ Playwright 本身是微软开源的自动化工具,官方只提供 Node.js、Python、Java、.NET 版本;Go 版 playwright-go 由社区(playwright-community)维护,并非微软官方发布。
支持:Chromium(Chrome/Edge)、Firefox、WebKit(Safari 的内核),以及移动端设备模拟(Android/iOS 的视口与 UA)…
安装 1 go get github.com/playwright-community/playwright-go
优点:
功能最全面
适合超级复杂网站
能录屏、截图、抓网络请求
缺点:
”让client sleep等加载完后再拉html也是一样的吧?” http.Client 只是个下载 HTML 的工具,它:
不能渲染 JS
不能处理异步加载的资源
不能模拟浏览器行为
指数退避 每次重试,等待时间翻倍。