go 学习 3

1、正则表达式应用

package main

import "fmt"
import "regexp"

/*
正则表达式   能够匹配的字符串
abc        abc
a.c         .表示通配,除了\n以外所有的字符  axc  aac abc
a\.c        只能匹配a.c
a\dc        能匹配(0-9)的任意数字  a1c a2c a3c
abc*        能匹配前一个字符的0次或无限次   ab  abc abcc abccccccc
abc+        能匹配前一个字符的1次或无限次     abc abcc abccccccc
*/

// content1 is the sample text scanned by main below; it mixes integers,
// floats and words so the float-matching pattern has both hits and misses.
var content1 string = "3.14 123224 .68  hello 1.0 abc 6666 888"

// main demonstrates basic regexp matching: it extracts every
// "digits.digits" float-looking token from content1 and prints first
// the full match table, then each match on its own line.
func main() {
    // Compile once up front; MustCompile panics on a bad pattern, which
    // is acceptable for a literal written at the call site.
    floatRe := regexp.MustCompile(`\d+\.\d+`)

    // Second argument is the match limit; -1 means "no limit".
    matches := floatRe.FindAllStringSubmatch(content1, -1)

    fmt.Printf("%+v\n", matches)
    fmt.Println("----------------")
    for i := range matches {
        fmt.Printf("%+v\n", matches[i][0]) // element 0 is the whole match
    }
}
package main

import "fmt"
import "regexp"

// str is the multi-line HTML-like sample that main's regexp runs over.
// A backquoted (raw) literal keeps the embedded newlines verbatim, which
// is what the multi-line <div> match below relies on.
var str string = `
    <tile>标题</tile>
    <div>hahah</div>
    <div>xixi
    xixixi
    hahahah
    wawawa</div>
`

//<div>(.*?)</div>
/*
go语言中
反引号(原始字符串)中:\n表示一个\字符,一个n字符
`
dabc
\n
\r
`
双引号中:\n表示换行,\表示转义
"
dbad
\n
\r
"
*/

// main extracts every <div>...</div> section from str, including ones
// that span several lines, and prints each section both with and
// without the surrounding tags.
func main() {
    // (?s:...) enables the s flag for the inner group only, letting `.`
    // match newlines (off by default in Go's regexp). x*? repeats x >= 0
    // times, preferring as few repetitions as possible (non-greedy).
    //
    // Flag reference for (?flags:re), a non-capturing group:
    //   i  case-insensitive matching (default off)
    //   m  ^ and $ also match at line start/end, not just text start/end (default off)
    //   s  let . match \n as well (default off)
    //   U  ungreedy: swap the meaning of x* and x*?, x+ and x+?, etc. (default off)
    divRe := regexp.MustCompile(`<div>(?s:(.*?))</div>`)

    // Second argument is the match limit; -1 means "no limit".
    sections := divRe.FindAllStringSubmatch(str, -1)

    // The one-line dump looks cramped because matches contain \n.
    fmt.Printf("%+v\n", sections)

    for _, sec := range sections {
        fmt.Println("=================")
        fmt.Println(sec[0]) // whole match, <div> tags included
        fmt.Println(sec[1]) // capture group: inner text only
    }
}

通过正则 爬取网页数据

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
    "time"

    "github.com/axgle/mahonia"
)

// Spider crawls a paginated joke-listing site one page at a time.
type Spider struct {
    page int // 1-based index of the listing page currently being crawled
}

// ConvertToString re-encodes src from the srcCode character set to the
// tagCode character set (called below with "gbk" -> "utf-8") using the
// third-party mahonia codec package.
func ConvertToString(src string, srcCode string, tagCode string) string {
    // Decode the raw bytes as srcCode first...
    srcCoder := mahonia.NewDecoder(srcCode)
    srcResult := srcCoder.ConvertString(src)
    // ...then translate the intermediate bytes toward the target charset.
    // NOTE(review): Translate's count and error results are discarded, so
    // conversion failures go unnoticed here — confirm that is acceptable.
    tagCoder := mahonia.NewDecoder(tagCode)
    _, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
    result := string(cdata)
    return result
}
// httpGet downloads url and returns its body converted from GBK to
// UTF-8, together with the HTTP status code.
//
// On a transport error it returns ("", -100); on a body-read error it
// returns ("", resp.StatusCode). Callers treat any code != 200 as a
// failure.
func httpGet(url string) (conter string, statusCode int) {
    // Use a client with an explicit timeout so a stalled server cannot
    // hang the crawler forever (plain http.Get has no deadline at all).
    client := &http.Client{Timeout: 30 * time.Second}

    resp, err := client.Get(url)
    if err != nil { // non-nil error means the connection failed
        fmt.Println(err)
        conter = ""
        statusCode = -100
        return
    }
    defer resp.Body.Close()

    // Read the whole response body from the server.
    data, err := ioutil.ReadAll(resp.Body)
    if err != nil { // reading the body failed
        fmt.Println(err)
        statusCode = resp.StatusCode // report the server's status code
        return
    }

    // The site serves GBK; convert to UTF-8 before handing the text on.
    conter = ConvertToString(string(data), "gbk", "utf-8")
    statusCode = resp.StatusCode

    return
}

// SpiderOneDz fetches a single joke page and pulls out its title and
// body text. Both results stay empty when the download fails.
func (this *Spider) SpiderOneDz(url string) (dz_title string, dz_content string) {
    // Download the stand-alone page for this one joke.
    page, rcode := httpGet(url)
    if rcode != 200 {
        fmt.Println("SpiderOneDz is httpGet err")
        return
    }

    // The title sits in the first <h1>...</h1> pair; FindStringSubmatch
    // returns only the first match, which matches the original "take the
    // first hit" loop.
    titleRe := regexp.MustCompile(`<h1>(.*?)</h1>`)
    if m := titleRe.FindStringSubmatch(page); m != nil {
        dz_title = m[1]
    }

    // The body sits in <td><p>...</p></td>; (?s:) lets `.` span newlines.
    // If several blocks match, the last one wins.
    bodyRe := regexp.MustCompile(`<td><p>(?s:(.*?))</p></td>`)
    for _, m := range bodyRe.FindAllStringSubmatch(page, -1) {
        dz_content = m[1]
    }

    return
}

// StroeDzToFile appends the scraped jokes to mydata.txt, one
// "title then content" record per entry, separated by marker lines.
// Only the pairs present in BOTH slices are written, so a length
// mismatch between the two slices can no longer panic.
func (this *Spider) StroeDzToFile(title_slice []string, conter_slice []string) {
    filename := "mydata.txt"
    f, err := os.OpenFile(filename, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0644)
    if err != nil {
        fmt.Println("open file err")
        return
    }
    defer f.Close()

    // Guard against slices of different lengths; the original indexed
    // conter_slice[i] unconditionally and could panic.
    n := len(title_slice)
    if len(conter_slice) < n {
        n = len(conter_slice)
    }

    for i := 0; i < n; i++ {
        // Stop on the first write error instead of silently dropping data.
        record := "\n=============\n" + title_slice[i] + "\n==============\n" + conter_slice[i]
        if _, err := f.WriteString(record); err != nil {
            fmt.Println("write file err:", err)
            return
        }
    }
}

// SpiderOnePage crawls one listing page: it collects the link of every
// joke on the page, fetches each joke's title and content, prints them,
// and finally appends the whole batch to the output file.
func (this *Spider) SpiderOnePage() {
    fmt.Println("正在爬取", this.page, "页")

    // Page 1 has a special index URL; later pages are numbered.
    var url string
    if this.page == 1 {
        url = "https://www.xxxx.com/dz/index.html"
    } else {
        url = "https://www.xxxx.com/dz/list_" + strconv.Itoa(this.page) + ".html"
    }
    fmt.Println(url)

    // Download the listing page itself.
    listing, rcode := httpGet(url)
    if rcode != 200 {
        fmt.Println("httpGet err")
        return
    }

    // Each joke's relative link appears as `<h4> <a href="..."` markup.
    linkRe := regexp.MustCompile(`<h4> <a href="(.*?)"`)
    links := linkRe.FindAllStringSubmatch(listing, -1)

    // Collect every title/content pair before writing them in one go.
    title_slice := make([]string, 0)
    conter_slice := make([]string, 0)

    // Visit every joke linked from this listing page.
    for _, link := range links {
        fullURL := "https://www.xxxxx.com" + link[1]

        // Fetch title and body of this individual joke.
        dz_title, dz_content := this.SpiderOneDz(fullURL)

        // Normalize line endings and strip stray closing tags.
        dz_content = strings.Replace(dz_content, "\r\n", "\n", -1)
        dz_content = strings.Replace(dz_content, "</p>", "", -1)

        fmt.Println("title===", dz_title)
        fmt.Println("content===", dz_content)

        title_slice = append(title_slice, dz_title)
        conter_slice = append(conter_slice, dz_content)
    }

    this.StroeDzToFile(title_slice, conter_slice)
    fmt.Println("代码结束")
}

// doWork drives the crawl interactively: it fetches one listing page,
// then waits for keyboard input before moving on to the next page.
// Typing "exit" stops the loop.
func (this *Spider) doWork() {
    fmt.Println("开始爬取")

    this.page = 1

    var cmd string
    for {
        fmt.Println("请输入任意键,开始爬取下一页,输入exit 退出")
        fmt.Scanf("%s", &cmd)
        if cmd == "exit" {
            fmt.Println("exit")
            break
        }
        // Crawl the current page, then advance to the next one.
        this.SpiderOnePage()
        this.page++
    }
}

// main wires up a Spider and starts the interactive crawl loop.
func main() {
    var sp Spider
    sp.doWork()
}

map 数据结构

package main

import "fmt"

// main walks through Go's map basics: nil vs. allocated maps,
// insertion, iteration, lookup by key, in-place update and delete.
func main() {
    // A declared-but-unallocated map is nil and cannot be written to.
    var mymap map[string]string
    if mymap == nil {
        fmt.Println("mymap is nil")
    }

    // make allocates the backing storage, after which writes are legal.
    mymap = make(map[string]string)
    if mymap != nil {
        fmt.Println("mymap is not nil")
    }

    mymap["aaa"] = "1111"
    mymap["bbb"] = "2222"
    mymap["ccc"] = "3333"

    // Iterate key/value pairs (iteration order is randomized by the runtime).
    for key, value := range mymap {
        fmt.Println(key)
        fmt.Println(value)
    }

    fmt.Println("-------------")
    // Keys only, then look each value up explicitly.
    for key := range mymap {
        fmt.Println(mymap[key])
    }
    fmt.Println("-------------")

    // interface{} values let one map mix field types, struct-style.
    Student := map[string]interface{}{
        "Name": "xxx",
        "Age":  19,
    }

    for key, value := range Student {
        fmt.Println(key)
        fmt.Println(value)
    }
    fmt.Println("-------------")

    // Assigning to an existing key overwrites its value in place.
    Student["Age"] = 29
    for _, value := range Student {
        fmt.Println(value)
    }

    // delete removes the key/value pair entirely.
    delete(Student, "Age")
    for key := range Student {
        fmt.Println(key)
    }
}

go语言中的json解析

package main

import "fmt"

import "encoding/json"

// Movie is the sample record marshalled to and from JSON below.
// Fields are exported (capitalized) so encoding/json can see them; with
// no struct tags, the JSON keys are the field names themselves.
type Movie struct {
    Title string
    Year  int
    Price int
}

// my_Year demonstrates partial decoding: Unmarshal fills only the
// fields present in this anonymous struct and ignores the rest of the
// JSON document.
var my_Year struct {
    Year  int
    Title string
}

// jsons holds the encoded bytes produced by encoding() for decoding().
var jsons []byte

// encoding marshals a sample Movie to JSON, prints it, and stashes the
// bytes in the package-level jsons for decoding() to consume.
func encoding() {
    sample := Movie{"西红柿首富", 2018, 50}

    encoded, err := json.Marshal(sample)
    if err != nil {
        fmt.Println("json.Marshal err")
        return
    }

    fmt.Printf("json_str %s\n", encoded)
    fmt.Println("---------")
    jsons = encoded
}

// decoding shows three ways to consume the JSON held in jsons: into a
// partial anonymous struct, into the full Movie struct, and into a
// generic map[string]interface{}.
func decoding() {
    fmt.Printf("json_str %s\n", jsons)

    // 1) Partial decode: only Year and Title get picked up.
    if err := json.Unmarshal(jsons, &my_Year); err == nil {
        fmt.Printf("%+v\n", my_Year)
    }
    fmt.Println("---------")

    // Full decode into the concrete struct restores every field at once.
    my_movie := Movie{}
    if err := json.Unmarshal(jsons, &my_movie); err == nil {
        // decode succeeded
        fmt.Printf("%+v\n", my_movie)
    }
    fmt.Println("---------")

    // 2) Schema-less decode into a map; JSON numbers arrive as float64.
    my_map := map[string]interface{}{}
    if err := json.Unmarshal(jsons, &my_map); err == nil {
        fmt.Printf("%+v\n", my_map)
    }
    fmt.Println("---------")

    fmt.Println(my_map["Price"])
    fmt.Println(my_map["Title"])
}

// main runs the round trip: encode a Movie to JSON, then decode it back
// in several shapes.
func main() {
    encoding()
    decoding()
}

猜你喜欢

转载自blog.csdn.net/u014749668/article/details/81674484