首次 golang爬虫插件gocolly/colly 使用经历

栏目: Go · 发布时间: 5年前

内容简介:涉及问题。系列文章:https://www.cnblogs.com/majianguo/p/8146110.html 。1). 一个 controller 可以定义多个 OnHTML 回调函数;2). "div > p" --> div 的直接子元素 p,"div p" --> div 下的所有后代元素 p(不限层级)

  • 各种包 例如:golang.org/x/net, golang.org/x/text 等的下载问题,有可能要翻墙
  • golang 与数据库交互
  • golang 文件读写
  • golang 多线程使用
  • golang 字符编码的转换
  • js 标签选择器


js 选择器的问题

1). 一个 controller 可以定义多个 OnHTML 回调函数;2). "div > p" --> div 的直接子元素 p,"div p" --> div 下的所有后代元素 p(不限层级)


参考 : https://blog.csdn.net/webxscan/article/details/70174658

import (
    _ "github.com/go-sql-driver/mysql"

const (
    userName    = "root"
    password    = ""
    ip          = ""
    port        = "3306"
    dbName      = "dbName"

path := strings.Join([]string{userName, ":", password, "@tcp(",ip, ":", port, ")/", dbName, "?charset=utf8"}, "")

DB, _ := sql.Open("mysql", path)
if errConn := DB.Ping(); errConn != nil{
    fmt.Println("open database fail")
fmt.Println("connnect success")
defer DB.Close()

stmt, err := DB.Prepare("insert into user(name age) values(?, ?)")
if err != nil {
res, err := stmt.Exec("username", 18)
if err != nil {
// 获取新插入行的id


var fileName = "./flag.txt"
var file *os.File
var err error

func main() {
    file = openFile(fileName)
    writeFile(file, "keep coding!!")

func openFile(fileName string) *os.File {
    if checkFileIsExist(fileName) {
        file, err = os.OpenFile(fileName, os.O_APPEND, 0666)
    } else {
        file, err = os.Create(fileName)
    return file

func writeFile(file *os.File, content string) {
    writer := bufio.NewWriter(file)

func check(e error) {
    if e != nil {

func checkFileIsExist(filename string) bool {
    var exist = true
    if _, err := os.Stat(filename); os.IsNotExist(err) {
        exist = false
    return exist


抓取链家网数据时,网页本身是utf8编码,没有问题,可是抓取房天下的数据时,网页本身是gb2312,折腾了好一会, F**K

func coverGBKToUTF8(src string) string {
    // 网上搜有说要调用translate函数的,实测不用
    return mahonia.NewDecoder("gbk").ConvertString(src)



func main() {
    // 我们还可以创建一个带缓冲的channel:
    //c := make(chan int, 1024)
    // 从带缓冲的channel中读数据
    //for i:= range c {

    chs := make([] chan int, 10)
    for i := 0; i < 10; i ++ {
       go func(i int) {
           chs[i] = make(chan int)
           count(chs[i], i)
           //fmt.Println("run thread ", i) // 打印要放在向信道发消息之前
    for _, ch := range chs {
       value := <- ch
       fmt.Println(value, " thread done")
    fmt.Println("All done")

func count(ch chan int, i int) {
    ch <- i // 向信道发消息的过程一定要放在协程内部,才不会被主进程阻塞


package main

import (
    _ "github.com/go-sql-driver/mysql"

const (
    userName    = "root"
    password    = ""
    ip          = ""
    port        = "3306"
    dbName      = "dbName"

type XQinfo struct {
    name            string
    addr            string
    area            string
    postCode        string
    propertyRight   string
    propertyType    string
    buildTime       string
    developer       string
    buildType       string
    buildArea       string
    buildStruct     string
    floorSpace      string
    manageCompany   string
    greenRatio      string
    plotRatio       string
    propretyFee     string
    AdditionalInfo  string
    waterSupply     string
    heatSupply      string
    elecSupply      string
    gas             string
    security        string
    environment     string
    parkingSpace    string
    OtherInfo       string

var flagCh      = make(chan  int)
var count       = 1
var fileName    = "./flag.txt"
var file *os.File
var err error

func main() {
    path    := strings.Join([]string{userName, ":", password, "@tcp(",ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
    max     := 2
    file    = openFile(fileName)

    for i := 0; i < max; i ++  {
        for j := 1; j <= 10; j ++ {
            DB, _ := sql.Open("mysql", path)
            if errConn := DB.Ping(); errConn != nil{
                fmt.Println("open database fail")
            fmt.Println("connnect success")
            defer DB.Close()

            link := "http://tj.esf.fang.com/housing/__0_0_0_0_" + strconv.Itoa(i * 10 + j) + "_0_0_0/"
            go work(link, DB, i * 10 + j)

    for {
        <- flagCh
        if count < max * 10 {
            fmt.Println("<- receive the " + strconv.Itoa(count) + " thread ending flag")
        } else {

        fmt.Println("All "+ strconv.Itoa(count) + " has done")

    defer file.Close()

func openFile(fileName string) *os.File {
    if checkFileIsExist(fileName) {
        file, err = os.OpenFile(fileName, os.O_APPEND, 0666)
    } else {
        file, err = os.Create(fileName)
    return file

func writeFile(file *os.File, content string) {
    writer := bufio.NewWriter(file)

func check(e error) {
    if e != nil {

func checkFileIsExist(filename string) bool {
    var exist = true
    if _, err := os.Stat(filename); os.IsNotExist(err) {
        exist = false
    return exist

func work(url string, DB *sql.DB, page int) {
    c                   := colly.NewCollector()
    detailLink          := c.Clone()
    detailController    := c.Clone()
    infos               := make([]XQinfo, 0)

    c.OnHTML(".plotListwrap > dt > a", func(e *colly.HTMLElement) {
        link    := e.Attr("href")

        fmt.Printf("link : %s \t", link)


    detailLink.OnHTML("#kesfxqxq_A01_03_01", func(e *colly.HTMLElement) {
        link    := e.ChildAttr("a", "href")
        //content := e.ChildText("a")

        //fmt.Printf("detial link : %s \t", link)
        //fmt.Printf("detial content : %s \t", coverGBKToUTF8(content))


    detailController.OnHTML("body", func(e *colly.HTMLElement) {

        info := XQinfo{}

        // 小区名称
        name := e.DOM.Find(".ceninfo_sq > h1 > a").Text()
        info.name = coverString(name)

        e.DOM.Find(".inforwrap").Each(func(i int, selection *goquery.Selection) {

            // 模块名称
            modelName := coverString(selection.Prev().Find("h3").Text())
            //fmt.Println("h3  -> ", modelName)

            switch modelName {
            case "基本信息":
               dealInfo(selection, &info)
            case "配套设施":
               dealInfo(selection, &info)
            case "周边信息":
               selection.Find("dl dt").Each(func(_ int, otherSelect *goquery.Selection) {
                   tab := coverString(otherSelect.Text())
                   del := strings.Index(tab, "本段合作")
                   if del == -1 {
                       info.OtherInfo = info.OtherInfo + tab + "|"

        infos = append(infos, info)

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())

    c.OnScraped(func(response *colly.Response) {

        for _, info := range infos{
           insertDB(DB, info)

        fmt.Println("the "+ strconv.Itoa(page) + " thread sending end flag ->")

        flagCh <- 1


// 处理小区基础信息
func dealInfo(selection *goquery.Selection, info *XQinfo) {
    selection.Find("dl dd").Each(func(_ int, selectionbase *goquery.Selection) {
        setXQinfo(selectionbase, info)

    selection.Find("dl dt").Each(func(_ int, selectionbase *goquery.Selection) {
        setXQinfo(selectionbase, info)

func setXQinfo(selectionbase *goquery.Selection, info *XQinfo) {

    orgKey  := coverString(selectionbase.Find("strong").Text())
    index   := strings.Index(orgKey, ":")

    var key string
    if index > 0 {
        key = orgKey[:index]
    } else {
        key = orgKey

    var value string
    var fullValue string
    value,ok := selectionbase.Attr("title")
    if ok {
        value = coverString(value)
    } else {
        fullValue   = coverString(selectionbase.Text())
        value       = fullValue[strings.Index(fullValue, ":") + 3:]

    switch key {
    case "小区地址":
        info.addr           = value
    case "所属区域":
        info.area           = value
    case "邮编":
        info.postCode       = value
    case "产权描述":
        info.propertyRight  = value
    case "物业类别":
        info.propertyType   = value
    case "建筑年代":
        info.buildTime      = value
    case "开 发 商":
        info.developer      = value
    case "建筑结构":
        info.buildStruct    = value
    case "建筑类型":
        info.buildType      = value
    case "建筑面积":
        info.buildArea      = value
    case "占地面积":
        info.floorSpace     = value
    case "物业公司":
        info.manageCompany  = value
    case "绿 化 率":
        info.greenRatio     = value
    case "容 积 率":
        info.plotRatio      = value
    case "物 业 费":
        info.propretyFee    = value
    case "附加信息":
        info.AdditionalInfo = value
    case "供水":
        info.waterSupply    = value
    case "供暖":
        info.heatSupply     = value
    case "供电":
        info.elecSupply     = value
    case "燃气":
        info.gas            = value
    case "安全管理":
        info.security       = value
    case "卫生服务":
        info.environment    = value
    case "停 车 位":
        info.parkingSpace   = value

func coverGBKToUTF8(src string) string {
    return mahonia.NewDecoder("gbk").ConvertString(src)

func replaceNullHtml(src string) string {
    return strings.Replace(src, "聽", "", -1)

func coverString(src string) string {
    return replaceNullHtml(coverGBKToUTF8(src))

func insertDB(DB *sql.DB, info XQinfo) {
    t := reflect.TypeOf(info)
    v := reflect.ValueOf(info)

    sql1 := "insert into rx_xiaoqu("
    sql2 := ") values ("
    sql3 := ")"

    for i := 0; i < t.NumField(); i++ {

        sql1 = sql1 + t.Field(i).Name
        sql2 = sql2 + "'" + v.Field(i).String() + "'"

        if i != t.NumField() - 1 {
            sql1 = sql1 + ", "
            sql2 = sql2 + ", "

        //fmt.Printf("key -> %s, value -> %s", t.Field(i).Name, v.Field(i))
    //fmt.Println(sql1, sql2)

    stmt, err := DB.Prepare(sql1 + sql2 + sql3)
    if err != nil {
        fmt.Println(sql1 + sql2)
    res, err := stmt.Exec()
    if err != nil {

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持 码农网




