版权声明:本文纯属作者口胡,欢迎转载 https://blog.csdn.net/TQCAI666/article/details/89311144
文章目录
需求
1. 数据获取
   使用selenium从github爬取repo信息,为5元组(url,name,lauguage,description,fork)
2. 数据存储
   存放在关系型数据库github_repo_db中,对于用户username,存放在表username中,name为主属性,表结构为(url,name,lauguage,description,fork)
3. 树形归类
   用mongodb维护一个树形结构,其中repos为列表类型,存放repo的names
4. 界面与CRUD
用ajax与服务器交互,获取服务器的树形信息显示出来,并且可以对文件夹进行交互。要求有3个界面:
4.1. repos列表界面,对repos进行文件夹归类(可以放入多个文件夹)。
4.2. 文件夹与repos混合界面,为归类后的效果。
4.3. 文件夹编辑界面,对文件夹树形结构进行编辑。
编码
数据获取
使用selenium从github爬取repo信息,为5元组(url,name,lauguage,description,fork)
https://blog.csdn.net/TQCAI666/article/details/89226236
数据存储
存放在关系型数据库github_repo_db中,对于用户username,存放在表username中,name为主属性,表结构为(url,name,lauguage,description,fork)
-- Create the schema that holds the per-user repository tables; no-op if it already exists.
Create Database If Not Exists github_repo_db;
以用户TQCAI为例:
-- Repository table for the example user TQCAI (the table is named after the user).
-- One row per repository, keyed by the repository name.
-- NOTE(review): the column widths here are narrower than the ones repo_mysql.py
-- uses when it creates the same table (varchar(500)/varchar(400)); the sample
-- fork URL in that file is longer than 50 characters -- confirm which DDL is intended.
Create Table If Not Exists github_repo_db.`TQCAI`(
-- Link to the repository page.
url varchar(100),
-- Repository name; primary key of the table.
name varchar(50) PRIMARY KEY ,
-- Language label of the repository; the column name is spelled "lauguage" (sic).
lauguage varchar(20),
-- Free-text repository description.
description TEXT,
-- Link to the repository this one was forked from.
fork varchar(50)
);
repo_mysql.py
# -*- coding: utf-8 -*-
import pymysql
import random as rd
from random import choice
class MySqlOP(object):
    """Store one GitHub user's repositories in a MySQL table via PyMySQL.

    On construction the ``github_repo_db`` database and a table named after
    the user are created if missing. Each row is the 5-tuple
    ``(url, name, lauguage, description, fork)`` with ``name`` as primary key.
    """

    # Connection settings shared by every instance.
    HOST = 'localhost'
    USER = 'root'
    PASSWORD = '密码'

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a back-quoted MySQL identifier.

        Identifiers cannot be sent as query parameters, so embedded backticks
        are doubled to keep the name from terminating the quoting early.
        """
        return '`' + name.replace('`', '``') + '`'

    def _table(self):
        """Return the fully qualified, quoted ``database.table`` name."""
        return (f'{self._quote_identifier(self.dbname)}'
                f'.{self._quote_identifier(self.username)}')

    def __init__(self, username):
        """Connect to MySQL and make sure the database and user table exist.

        Args:
            username: GitHub user name; also used as the table name.
        """
        self.dbname = 'github_repo_db'
        self.username = username

        # Bootstrap connection without a default schema, used only to create
        # the database. Keyword arguments are used because newer PyMySQL
        # releases no longer accept positional connection parameters.
        bootstrap = pymysql.connect(
            host=self.HOST, user=self.USER, password=self.PASSWORD)
        try:
            with bootstrap.cursor() as cursor:
                cursor.execute(
                    'Create Database If Not Exists '
                    f'{self._quote_identifier(self.dbname)};')
            bootstrap.commit()
        finally:
            # Always release the bootstrap connection, even if creation fails.
            bootstrap.close()

        # Working connection bound to the repository database.
        self.db = pymysql.connect(
            host=self.HOST, user=self.USER, password=self.PASSWORD,
            database=self.dbname)
        self.cursor = self.db.cursor()

        # Create the per-user table.
        sql = f'''Create Table If Not Exists {self._table()}(
            url varchar(500),
            name varchar(400) PRIMARY KEY ,
            lauguage varchar(20),
            description TEXT,
            fork varchar(500)
        );
        '''
        self.cursor.execute(sql)
        self.db.commit()

        # Select the repository database for subsequent statements.
        self.cursor.execute(f'use {self._quote_identifier(self.dbname)};')
        self.db.commit()

    def deleteAll(self):
        """Delete every row from the user's table and commit."""
        # The table name is quoted so user names containing characters such
        # as '-' (legal on GitHub) do not break the statement.
        self.cursor.execute(f'delete from {self._table()};')
        self.db.commit()

    def close_db(self):
        """Close the working database connection."""
        self.db.close()

    def insert_data(self, url, name, lauguage, description, fork):
        """Insert one repository row and commit.

        Args:
            url: Link to the repository.
            name: Repository name (primary key).
            lauguage: Language label of the repository.
            description: Repository description; may contain quotes.
            fork: Link to the fork source, or '' when not a fork.
        """
        # Values go through driver-side parameter binding instead of being
        # spliced into the statement, so quotes/backslashes in any field are
        # stored verbatim and cannot alter the SQL.
        sql = f'insert into {self._table()} values(%s,%s,%s,%s,%s);'
        params = (url, name, lauguage, description, fork)
        # Echo the statement as it will be sent, for debugging.
        print(self.cursor.mogrify(sql, params))
        self.cursor.execute(sql, params)
        self.db.commit()
if __name__ == '__main__':
    # Smoke test: insert one sample repository row for the user TQCAI.
    sql = MySqlOP('TQCAI')
    try:
        # insert_data expects (url, name, lauguage, description, fork);
        # the URL must come first and the repository name second.
        sql.insert_data(
            'https://github.com/TQCAI/Computer-Networking-A-Top-Down-Approach-NOTES',
            'Computer-Networking-A-Top-Down-Approach-NOTES',
            'HTML',
            '《计算机网络-自顶向下方法(原书第6版)》编程作业,Wireshark实验文档的翻译和解答。',
            'https://github.com/moranzcw/Computer-Networking-A-Top-Down-Approach-NOTES',
        )
    finally:
        # Release the connection even when the insert fails.
        sql.close_db()
scrap_github.py
from selenium import webdriver
from repo_mysql import MySqlOP
# Imported here because this script needs them in addition to the imports above.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Absolute XPath of the container that holds both the repository <ul> and the
# pagination links on a user's "repositories" tab.
REPO_LIST_XPATH = '/html/body/div[4]/main/div/div[3]/div[3]/div[2]'

profile = webdriver.FirefoxOptions()
profile.add_argument('-headless')  # run Firefox without a visible window
browser = webdriver.Firefox(options=profile)
try:
    user = 'TQCAI'
    url = f'https://github.com/{user}?tab=repositories'
    sql = MySqlOP(user)
    try:
        # Start from an empty table so a re-run does not hit duplicate keys.
        sql.deleteAll()
        isPageTurning = False  # True once at least one page has been processed

        # Outer loop: one iteration per result page of the user's repositories.
        while True:
            browser.get(url)
            index = 1  # 1-based position of the current <li> in the repo list
            while True:
                item = f'{REPO_LIST_XPATH}/ul/li[{index}]/div[1]'

                # Repository name and link; a missing <li> means end of page.
                try:
                    repo = browser.find_element(By.XPATH, f'{item}/div[1]/h3/a')
                except NoSuchElementException:
                    break
                repoName = repo.text
                repoHref = repo.get_attribute('href')

                # Language label (absent for repositories without one).
                try:
                    language = browser.find_element(
                        By.XPATH, f'{item}/div[3]/span[2]').text
                except NoSuchElementException:
                    language = ''

                # Repository description (optional).
                try:
                    description = browser.find_element(
                        By.XPATH, f'{item}/div[2]/p').text
                except NoSuchElementException:
                    description = ''

                # Fork source link (only present for forked repositories).
                try:
                    fork = browser.find_element(
                        By.XPATH, f'{item}/div[1]/span/a').get_attribute('href')
                except NoSuchElementException:
                    fork = ''

                index += 1
                # Persist the row.
                sql.insert_data(repoHref, repoName, language, description, fork)

            # Locate the "next page" link: on the first page it is the first
            # <a> in the pager; on later pages the script uses the second <a>.
            btnID = 2 if isPageTurning else 1
            isPageTurning = True
            try:
                button = browser.find_element(
                    By.XPATH, f'{REPO_LIST_XPATH}/div/div/a[{btnID}]')
            except NoSuchElementException:
                print('结束了')
                break
            url = button.get_attribute('href')
            print(url)
            print(button.text)
    finally:
        # Close the DB connection even if scraping fails part-way.
        sql.close_db()
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    browser.quit()