通过python获取论坛用户信息的demo

今天写一个通过python获取论坛用户信息的demo

有朋友会问,这玩意有啥用么?用处老大了,举个栗子,通过获取到用户的信息与本地库比对,就会获得N多账号密码,不多解释

爬取一个CTF的站吧,https://www.bugku.com/

先分析:1. 会员数量。看图说话,官网提供的数据明显不对,自己抓取吧。

2.抓取会员用户名、id、用户组

通过对比页面源码发现,管理员与普通用户的用户组在页面中的结构不一样,因此在代码中做了一次判断;对于没有用户组的会员,直接传空值。

抓取后的MySQL图:

看代码吧,里面的细节自己分析自己看!

 1 # -- coding: utf-8 --
 2 import pymysql
 3 import requests
 4 from lxml import etree
 5 headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
 6 class spiders(object):
 7     """爬取"""
 8     def html(self,url):
 9         """"抓取目标网页源码"""
10         html=response=requests.request("get",url,headers=headers)
11         if html.status_code ==200:
12             return response.text
13     def spider(self,html):
14         html1=etree.HTML(html)
15         title=html1.xpath("//title/text()")
16         try:
17             if str(title).find("个人资料"):
18                 username = html1.xpath(".//h2[@class='mbn']/text()")
19                 username = str(str(username[0]).split()).replace("[","").replace("]","")
20                 #对所有的获取信息进行去除[],()处理,字符格式化
21                 id = html1.xpath(".//span[@class='xw0']/text()")
22                 id = str(id).replace("UID: ", "").replace("(","").replace(")","").replace("[","").replace("]","")
23                 groups = html1.xpath(".//ul/li/span/a/text()")
24                 if len(groups) == 0:
25                     groups = html1.xpath(".//li[2]/span/a/font/text()")
26                 groups=str(groups).replace("[","").replace("]","")
27                     # 这里只抓取用户组和管理组,两个组的源码不一致 ,所以进行判断
28                 # print(username)
29                 # print(id)
30                 # print(groups)
31         except:
32             print("用户被禁用,忽略")
33             username="";id="";groups=""
34         return id,username,groups
35     def spider_newuser(self):
36         url = "https://www.bugku.com/"
37         html=self.html(url)
38         html1 = etree.HTML(html)
39         # new_user = html1.xpath("//title/text()")
40         new_user_id=html1.xpath(".//div[@id='content1_2']/div[@class='ptA st clear'][1]/h4/a/@href")
41         new_user_id=int(str(new_user_id).replace("space-uid-","").replace(".html","").replace('[','').replace(']','').replace("'",""))
42         return new_user_id
class writeobj(object):
    """Write scraped member records to MySQL."""

    def write_pa(self, id, username, groups):
        """Insert one member row into the ``users`` table.

        Args:
            id: Member uid; an empty string means "nothing scraped" and the
                call is a no-op.
            username: Member display name.
            groups: Member group label(s); may be empty.

        Returns:
            Always ``None``. Prints ``successful`` after a committed insert
            and ``Failed`` when the insert raises.
        """
        # Check before connecting so a skipped record never opens (and
        # leaks) a database connection.
        if id == "":
            return None

        db = pymysql.connect(host="localhost", user="root", password="root",
                             port=3306, db="DZ_spider")
        try:
            with db.cursor() as cursor:
                # Scraped text is untrusted: let the driver quote the values
                # instead of interpolating them into the SQL string.
                inserted = cursor.execute(
                    "INSERT INTO users VALUES (%s, %s, %s)",
                    (id, username, groups),
                )
            if inserted:
                print("successful")
                db.commit()
        except Exception:
            # Best effort: one bad row must not abort the whole crawl.
            print("Failed")
            db.rollback()
        finally:
            db.close()
        return None
68 
if __name__ == '__main__':
    # Crawl every member profile from uid 1 up to the newest member and
    # store each one in MySQL.
    crawler = spiders()
    writer = writeobj()
    newest_uid = crawler.spider_newuser()
    # range() excludes its stop value, so add 1 to include the newest member.
    for uid in range(1, newest_uid + 1):
        page = crawler.html(f"https://www.bugku.com/space-uid-{uid}.html")
        if page is None:
            # Download failed or returned a non-200 status; skip this uid.
            continue
        member_id, username, groups = crawler.spider(page)
        writer.write_pa(member_id, username, groups)

猜你喜欢

转载自www.cnblogs.com/fbhell/p/12458049.html