Colly 是一个简洁、灵活的爬虫框架,只包括一个核心,没有太多存储方面的东西,定制起来也很方便,目前只支持单爬虫。

项目地址

Github: https://github.com/asciimoo/colly

如果你喜欢这个框架,可以 star 或者贡献代码

基本特性

有些特性是作者给出来的,有些是自己总结的:

  • 同步/异步支持
  • User Agent支持
  • 并行限制
  • Cookie支持
  • Post方法支持
  • Cache支持
  • Goquery Selector

基本框架信息

系统框架如下:

system structure

基本上colly只包含两层:

  1. 最低层就是HTTP底层代码的封装,其中这一层封装了Cache和HTTP方法
  2. Collector层提供了基本的爬虫方法,以及基本的控制接口。其中有爬虫频率限制、方法回调

上面就是用户接口了,如果要实现并行的话,需要在用户逻辑代码里面实现,具体的代码可以参考示例:

package main

import (
	"fmt"

	"github.com/asciimoo/colly"
)

func main() {
	// Instantiate an asynchronous collector.
	//
	// BUG fixed: the original used the default (synchronous) collector,
	// spawned raw goroutines with "go e.Request.Visit(link)", and then
	// called c.Wait(). On a synchronous collector Wait() is a no-op, so
	// main could return — killing the scraping goroutines — before any
	// of them finished. Async(true) makes Visit non-blocking and makes
	// Wait() actually block until all requests complete.
	c := colly.NewCollector(
		colly.Async(true),
	)

	// MaxDepth is 2, so only the links on the scraped page
	// and links on those pages are visited.
	c.MaxDepth = 2

	// On every <a> element that has an href attribute, call the callback.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print the discovered link.
		fmt.Println(link)
		// Visit the link; the async collector schedules the request
		// concurrently itself, so no manual goroutine is needed.
		e.Request.Visit(link)
	})

	// Start scraping on https://en.wikipedia.org
	c.Visit("https://en.wikipedia.org/")
	// Block until every in-flight request has finished.
	c.Wait()
}

没有更多的功能,基本上给的例子可以囊括所有的方法,这是一个很简单的框架,主要借助了GoQuery库的优势,框架本身以简洁深得人心,框架自身的代码很简单。

栗子:煎蛋网段子抓取

煎蛋网-段子

package main

import (
	"fmt"
	"github.com/asciimoo/colly"
	"strconv"
	"github.com/PuerkitoBio/goquery"
	"time"
	"sync"
	"database/sql"
	_ "github.com/mattn/go-sqlite3"
)

// JIANDAN_DUAN_URL_TLP is the page-URL template for the jandan.net
// duanzi section; the single %s verb is filled with the page number.
const JIANDAN_DUAN_URL_TLP = "http://jandan.net/duan/page-%s"

// DBWriter persists scraped duanzi into a sqlite3 database file,
// serializing concurrent inserts with a mutex.
type DBWriter struct {
	lock     *sync.Mutex // guards the insert in Write against concurrent callbacks
	DBfile   string      // path of the sqlite3 database file
	instance *sql.DB     // open database handle created by NewDBWriter
}

// NewDBWriter opens (or creates) the sqlite3 database at filename and
// returns a DBWriter ready for use. It panics if the driver rejects
// the data source name.
func NewDBWriter(filename string) *DBWriter {
	handle, err := sql.Open("sqlite3", filename)
	if err != nil {
		panic(err)
	}

	writer := &DBWriter{
		DBfile:   filename,
		lock:     &sync.Mutex{},
		instance: handle,
	}
	return writer
}

// Reinit drops and recreates the duanzi table, discarding any rows
// stored by a previous run. It panics (via CheckError) if the DDL fails.
func (w *DBWriter) Reinit() {
	// Named "schema" rather than "sql" so the local variable does not
	// shadow the database/sql package import (the original shadowed it).
	schema := `
		DROP TABLE IF EXISTS duanzi;

	    CREATE TABLE duanzi (
        duanzi TEXT NOT NULL,
        created BIGINT NULL);
	`

	_, err := w.instance.Exec(schema)
	w.CheckError(err)
}

// Write inserts one duanzi row (text plus the current unix-nano
// timestamp) and returns the id of the new row. Any database failure
// panics via CheckError.
func (w *DBWriter) Write(duanzi string) int64 {
	stmt, err := w.instance.Prepare("INSERT INTO duanzi(duanzi, created) VALUES (?, ?)")
	w.CheckError(err)
	// BUG fixed: the prepared statement was never closed, leaking one
	// statement handle per call.
	defer stmt.Close()

	// Serialize inserts: sqlite3 handles only one writer at a time and
	// this method is called from concurrent scrape callbacks.
	w.lock.Lock()
	res, err := stmt.Exec(duanzi, time.Now().UnixNano())
	w.lock.Unlock()
	w.CheckError(err)

	id, err := res.LastInsertId()
	w.CheckError(err)

	return id
}

// CheckError panics when err is non-nil. It is used as a crash-fast
// guard around database calls and does not touch the receiver.
func (w *DBWriter) CheckError(err error) {
	if err == nil {
		return
	}
	panic(err)
}

// NOTE(review): orphaned comment block — saving data into sqlite3 is
// handled by the DBWriter methods above.
// duan_condition reports whether a duanzi passes the vote filter:
// the "like" count must be strictly greater than twice the "unlike"
// count.
//
// a1 is the like-vote count and a2 the unlike-vote count, both as
// decimal strings. Returns false when either count fails to parse.
func duan_condition(a1 string, a2 string) bool {
	likes, err := strconv.Atoi(a1)
	if err != nil {
		return false
	}

	// BUG fixed: the original assigned this parsed value to the *like*
	// variable ("a = s2"), so the dislike count stayed 0 and the filter
	// degenerated to "likes > 0".
	dislikes, err := strconv.Atoi(a2)
	if err != nil {
		return false
	}

	return likes > 2*dislikes
}

// find_largest_page extracts the largest page number from the
// pagination widget of jandan.net.
//
// The first <span> inside the selection is assumed to hold text like
// "[120]" (brackets around the number) — TODO confirm against the live
// markup. Returns 0 when the text is missing, too short, or not numeric.
func find_largest_page(selector *goquery.Selection) int {
	page_str := selector.Find("span").First().Text()
	// BUG fixed: the original sliced page_str[1:len(page_str)-1]
	// unconditionally, which panics for strings shorter than two
	// characters (e.g. when the selector matches nothing).
	if len(page_str) < 2 {
		return 0
	}
	if page, err := strconv.Atoi(page_str[1 : len(page_str)-1]); err == nil {
		return page
	}
	return 0
}

// generate_url builds the URL to visit for the given page number by
// substituting it into the duanzi URL template.
func generate_url(page string) string {
	url := fmt.Sprintf(JIANDAN_DUAN_URL_TLP, page)
	return url
}

func main() {
	// Async mode is required for c.Wait() to actually block: on the
	// default synchronous collector Wait returns immediately, so the
	// original's "go c.Visit(...)" goroutines could be killed when
	// main returned before scraping finished.
	c := colly.NewCollector(colly.Async(true))
	c.AllowedDomains = []string{"jandan.net"}
	c.SetRequestTimeout(5 * time.Second)
	// BUG fixed: the glob said "jiandan.net" while the site is
	// jandan.net, so the parallelism limit never matched any request.
	// The Limit error was also silently dropped.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*jandan.net*",
		Parallelism: 5,
	}); err != nil {
		panic(err)
	}

	w := NewDBWriter("test.db")
	w.Reinit()

	var page_size int

	// Phase 1: discover how many pages of duanzi exist.
	c.OnHTML(".cp-pagenavi", func(e *colly.HTMLElement) {
		page_size = find_largest_page(e.DOM)
	})
	c.Visit("http://jandan.net/duan/")
	// In async mode Visit returns immediately; wait here so page_size
	// is populated before the callback is detached and the loop runs.
	c.Wait()
	c.OnHTMLDetach(".cp-pagenavi")

	// Phase 2: scrape every page, keeping only well-voted duanzi.
	c.OnHTML(".row", func(e *colly.HTMLElement) {
		like_str := e.DOM.Find(".tucao-like-container>span").Text()
		unlike_str := e.DOM.Find(".tucao-unlike-container>span").Text()
		if duan_condition(like_str, unlike_str) {
			duanzi := e.DOM.Find(".text>p").Text()
			w.Write(duanzi)
		}
	})

	for i := page_size; i > 0; i-- {
		// The async collector schedules these concurrently itself
		// (bounded by the LimitRule), so no manual goroutines.
		c.Visit(generate_url(strconv.Itoa(i)))
	}

	c.Wait()
}