Golang multi-task crawler: scrape all the photos on the first page of every post in the 爆照 Tieba forum

Preface

  • I have always written crawlers in Python; on a whim I wanted to try writing one in Golang, so I did.
  • Since I had never written a crawler in Go, I am not familiar with its third-party libraries, so all data extraction is done with regular expressions.
  • Since I did not use an IP pool, a user-agent pool, or a cookie pool, I was eventually blocked by anti-crawling measures; the most I fetched in one run was 1054 photos.
  • Go's goroutines, in short: blazingly fast. The single-task version takes a bit over a minute to crawl and download those 1054 photos; the multi-task version finishes almost instantly (it maxes out my 300M broadband).
  • A multi-threaded Python implementation is also fast, but still a bit behind Go (blame the GIL).
  • The multi-task version comes in two forms: one blocks on a channel, the other uses sync.WaitGroup (recommended); a sketch of the channel form follows this list.
  • GitHub: code
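
The channel-blocking form is not reproduced in this post (only the sync.WaitGroup version appears below), but the idea is simple: every goroutine sends on a channel when it finishes, and the main goroutine blocks until it has received one signal per task. A minimal sketch of that pattern, with an illustrative task list that is not from the real crawler:

package main

import "fmt"

// Sketch of the channel-blocking form: main blocks on one receive
// per launched goroutine, so it cannot exit before they all finish.
func main() {
	tasks := []string{"page1", "page2", "page3"} // placeholder task list
	done := make(chan struct{})
	for _, t := range tasks {
		go func(task string) {
			fmt.Println("crawling", task) // stand-in for the real crawl work
			done <- struct{}{}            // signal completion
		}(t)
	}
	for range tasks {
		<-done // one receive per goroutine
	}
}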

Single-task version code

package main

import (
	"bufio"
	"fmt"
	"io"
	"net/http"
	"os"
	"path"
	"regexp"
	"strconv"
)

// Requirement: crawl all the photos on the first page of each post.

// Steps: request each list page and extract the detail-page URLs,
// request each detail page and extract the image URLs ending in .jpg,
// then request each image URL and save the response body to a file.

// HandleUrl fetches url and returns the response body as a string.
func HandleUrl(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("request failed:", err) // a nil resp would panic on Close below
		return ""
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	return string(body)
}

// SaveImage downloads imageUrl and saves it to disk.
func SaveImage(imageUrl string) {
	resp, err := http.Get(imageUrl)
	if err != nil {
		fmt.Println("download failed:", err)
		return
	}
	defer resp.Body.Close()
	filePath := "E:\\spiderImages\\美女" + path.Base(imageUrl)
	f, err := os.Create(filePath)
	if err != nil {
		fmt.Println("create file failed:", err)
		return
	}
	defer f.Close()
	reader := bufio.NewReaderSize(resp.Body, 32*1024)
	writer := bufio.NewWriter(f)
	_, _ = io.Copy(writer, reader)
	writer.Flush() // without Flush the buffered tail of the image never reaches disk
	fmt.Println("image saved")
}

// HandleDetail processes one detail page and extracts each photo's URL.
func HandleDetail(detailUrl string) {
	detailContent := HandleUrl(detailUrl)
	reg := regexp.MustCompile(`<img class="BDE_Image" src=".*?" size=`)
	imageTempSlice := reg.FindAllString(detailContent, -1)
	reg2 := regexp.MustCompile(`http.*jpg`)
	for _, i := range imageTempSlice {
		imgUrl := reg2.FindString(i)
		SaveImage(imgUrl)
	}
}

// HandleListContent processes one list page and extracts the detail-page URLs.
func HandleListContent(listContent string) {
	reg := regexp.MustCompile(`<a rel="noreferrer" href="/p/\d{10}`)
	resultSlice := reg.FindAllString(listContent, -1)
	reg2 := regexp.MustCompile(`\d{10}`)
	seedUrl := "https://tieba.baidu.com/p/"
	for _, i := range resultSlice {
		HandleDetail(seedUrl + reg2.FindString(i))
	}
}

// runListUrl builds the URL for each list page.
func runListUrl() {
	baseUrl := "https://tieba.baidu.com/f?kw=%E7%88%86%E7%85%A7&ie=utf-8&pn="
	for i := 0; i < 150; i += 50 {
		fmt.Println(i)
		// Build each page URL from the base; appending to the same variable
		// would accumulate the page numbers (pn=0, pn=050, pn=050100).
		listUrl := baseUrl + strconv.Itoa(i)
		Content := HandleUrl(listUrl)
		HandleListContent(Content)
	}
}

func main() {
	runListUrl()
}
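
As the preface mentions, the crawler was eventually blocked because every request goes out with Go's default headers and no user-agent, IP, or cookie rotation. A hedged, minimal mitigation is to set a User-Agent header before each fetch; the function below is an illustrative variant of HandleUrl (the header value is an arbitrary example), reusing the fmt, io, and net/http imports from the listing above:

// HandleUrlWithUA is a hypothetical variant of HandleUrl that sends a
// browser-like User-Agent header; the value below is just an example.
func HandleUrlWithUA(url string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("build request failed:", err)
		return ""
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println("request failed:", err)
		return ""
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	return string(body)
}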

Multi-task version code

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"path"
	"regexp"
	"strconv"
	"sync"
)

// Requirement: concurrently crawl all the photos on the first page of each post.

// Steps: request each list page and extract the detail-page URLs,
// request each detail page and extract the image URLs ending in .jpg,
// then request each image URL and save the response body to a file.

// wg tracks every outstanding goroutine so main can wait for all of them.
var wg sync.WaitGroup

// HandleUrl fetches url and returns the response body as a string.
func HandleUrl(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("request failed:", err) // a nil resp would panic on Close below
		return ""
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	return string(body)
}

// SaveImage downloads imageUrl and saves it to disk.
func SaveImage(imageUrl string) {
	defer wg.Done() // mark this task done even if an error aborts the download
	resp, err := http.Get(imageUrl)
	if err != nil {
		fmt.Println("download failed:", err)
		return
	}
	defer resp.Body.Close()
	filePath := "E:\\spiderImages2\\美女" + path.Base(imageUrl)
	f, err := os.Create(filePath)
	if err != nil {
		fmt.Println("create file failed:", err)
		return
	}
	defer f.Close()
	_, _ = io.Copy(f, resp.Body)
	fmt.Println("file saved")
}

// HandleDetail processes one detail page and extracts each photo's URL.
func HandleDetail(detailUrl string) {
	defer wg.Done()
	detailContent := HandleUrl(detailUrl)
	reg := regexp.MustCompile(`<img class="BDE_Image" src=".*?" size=`)
	imageTempSlice := reg.FindAllString(detailContent, -1)
	reg2 := regexp.MustCompile(`http.*jpg`)
	for _, i := range imageTempSlice {
		imgUrl := reg2.FindString(i)
		wg.Add(1) // Add before go: adding after launch races with wg.Wait
		go SaveImage(imgUrl)
	}
}

// HandleListContent processes one list page and extracts the detail-page URLs.
func HandleListContent(listContent string) {
	defer wg.Done()
	reg := regexp.MustCompile(`<a rel="noreferrer" href="/p/\d{10}`)
	resultSlice := reg.FindAllString(listContent, -1)
	reg2 := regexp.MustCompile(`\d{10}`)
	seedUrl := "https://tieba.baidu.com/p/"
	for _, i := range resultSlice {
		wg.Add(1) // Add before go, so the counter never transiently hits zero
		go HandleDetail(seedUrl + reg2.FindString(i))
	}
}

// runListUrl builds the URL for each list page.
func runListUrl() {
	baseUrl := "https://tieba.baidu.com/f?kw=%E7%88%86%E7%85%A7&ie=utf-8&pn="
	for i := 0; i < 150; i += 50 {
		// Build each page URL from the base; appending to the same variable
		// would accumulate the page numbers (pn=0, pn=050, pn=050100).
		listUrl := baseUrl + strconv.Itoa(i)
		Content := HandleUrl(listUrl)
		wg.Add(1)
		go HandleListContent(Content)
	}
	wg.Wait()
}

func main() {
	runListUrl()
}
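
Launching one unbounded goroutine per image is also part of why the site banned the crawler. A common way to stay concurrent but polite (a suggestion of mine, not something from the original code) is a buffered channel used as a semaphore; the standalone sketch below caps downloads at two at a time, with placeholder URLs standing in for the real image links:

package main

import (
	"fmt"
	"sync"
)

// Sketch: a buffered channel caps concurrent downloads at its capacity,
// so the target site never sees more than that many parallel requests.
func main() {
	urls := []string{"a.jpg", "b.jpg", "c.jpg", "d.jpg"} // placeholder URLs
	sem := make(chan struct{}, 2)                        // at most 2 concurrent workers
	var wg sync.WaitGroup
	for _, u := range urls {
		wg.Add(1) // Add before launching, so Wait cannot return early
		go func(url string) {
			defer wg.Done()
			sem <- struct{}{}               // acquire a slot; blocks while 2 are busy
			defer func() { <-sem }()        // release the slot
			fmt.Println("downloading", url) // stand-in for SaveImage
		}(u)
	}
	wg.Wait()
}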

Results

[Figure: screenshot of the crawler's results]

Reposted from blog.csdn.net/rusi__/article/details/103843389