项目:对github的repo进行树形分类

版权声明:本文纯属作者口胡,欢迎转载 https://blog.csdn.net/TQCAI666/article/details/89311144

需求

  1. 数据获取
    使用selenium从github爬取repo信息,为5元组(url,name,lauguage,description,fork)
  2. 数据存储
    存放在关系型数据库github_repo_db中,对于用户username,存放在表username中,name为主属性,表结构为(url,name,lauguage,description,fork)
  3. 树形归类
    用mongodb维护一个树形结构,其中repos为列表类型,存放repo的names
  4. 界面与CRUD
    用ajax与服务器交互,获取服务器上的树形信息并显示出来,并且可以对文件夹进行增、删、改、查(CRUD)操作。要求有3个界面:
    4.1. repos列表界面,对repos进行文件夹归类(可以放入多个文件夹)。
    4.2. 文件夹与repos混合界面,为归类后的效果。
    4.3. 文件夹编辑界面,对文件夹树形结构进行编辑。

编码

数据获取

使用selenium从github爬取repo信息,为5元组(url,name,lauguage,description,fork)
https://blog.csdn.net/TQCAI666/article/details/89226236

数据存储

存放在关系型数据库github_repo_db中,对于用户username,存放在表username中,name为主属性,表结构为(url,name,lauguage,description,fork)

-- Create the schema that holds one table per GitHub user; a no-op if it already exists.
Create Database If Not Exists github_repo_db;

以用户TQCAI为例

-- Per-user repository table (here for the user TQCAI); one row per repository.
-- NOTE(review): repo_mysql.py issues its own "Create Table If Not Exists" for the
-- same table with wider columns (url/fork varchar(500), name varchar(400)); whichever
-- statement runs first decides the actual column sizes.
Create Table If Not Exists github_repo_db.`TQCAI`(
  -- Repository URL.
  url varchar(100),
  -- Repository name; primary key, so it must be unique within this table.
  name varchar(50) PRIMARY KEY ,
  -- Language label of the repository (the column name is spelled "lauguage"
  -- everywhere in this project).
  lauguage varchar(20),
  -- Free-text repository description.
  description TEXT,
  -- URL of the upstream repository when this one is a fork.
  fork varchar(50)
);

repo_mysql.py

# -*- coding: utf-8 -*-

import pymysql
import random as rd
from random import choice



class MySqlOP(object):
    """Small helper around the per-user repository table in MySQL.

    Creating an instance makes sure the ``github_repo_db`` database and a
    table named after ``username`` exist, then keeps one open connection
    (``self.db``) and cursor (``self.cursor``) for the other methods.
    Call :meth:`close_db` when finished.
    """
    HOST='localhost'
    USER='root'
    PASSWORD='密码'

    def __init__(self,username):
        """Connect to MySQL and create the database / user table if missing.

        Args:
            username: GitHub user name; also used as the table name.
        """
        self.dbname='github_repo_db'
        self.username = username
        # Bootstrap connection without a default schema, used only to create
        # the database.  Keyword arguments are required by PyMySQL >= 1.0, and
        # the explicit close avoids depending on what ``with connection:``
        # means in the installed PyMySQL version.
        db = pymysql.connect(host=self.HOST, user=self.USER,
                             password=self.PASSWORD)
        try:
            cursor = db.cursor()
            cursor.execute(rf'Create Database If Not Exists {self.dbname};')
            db.commit()
        finally:
            db.close()
        # Working connection, bound to the repo database.
        self.db = pymysql.connect(host=self.HOST, user=self.USER,
                                  password=self.PASSWORD,
                                  database=self.dbname)
        self.cursor = self.db.cursor()
        # Create the per-user table.
        sql=fr'''Create Table If Not Exists {self.dbname}.{self._table}(
  url varchar(500),
  name varchar(400) PRIMARY KEY ,
  lauguage varchar(20),
  description TEXT,
  fork varchar(500)
);
'''
        self.cursor.execute(sql)
        self.db.commit()
        # Select the repo database for the statements that follow.
        sql=f"use {self.dbname};"
        self.cursor.execute(sql)
        self.db.commit()

    @property
    def _table(self):
        """Return the user's table name as a back-quoted SQL identifier.

        Identifiers cannot be sent as query parameters, so the name is quoted
        here (embedded backticks doubled).  Using the same quoting everywhere
        keeps user names such as ``some-user`` working in every statement.
        """
        return '`' + str(self.username).replace('`', '``') + '`'

    def deleteAll(self):
        """Delete every row from the user's table and commit."""
        sql=f'delete  from {self._table};'
        self.cursor.execute(sql)
        self.db.commit()

    def close_db(self):
        """Close the working database connection."""
        self.db.close()

    def insert_data(self,url,name,lauguage,description,fork):
        """Insert one repository row and commit.

        The values are passed as query parameters so the driver does the
        escaping; quotes, backslashes or SQL fragments in any field are
        stored literally instead of corrupting the statement.

        Args:
            url: Repository URL.
            name: Repository name (primary key of the table).
            lauguage: Language label (parameter keeps the column's spelling).
            description: Repository description.
            fork: URL of the upstream repository, or '' when not a fork.
        """
        # The driver formats the statement with ``%``, so a literal percent
        # sign in the table name has to be doubled.
        table = self._table.replace('%', '%%')
        sql=f"insert into {table} values(%s,%s,%s,%s,%s);"
        params = (url, name, lauguage, description, fork)
        print(sql, params)
        self.cursor.execute(sql, params)
        self.db.commit()

if __name__ == '__main__':
    # Manual smoke test: inserts one known repository row for user TQCAI.
    # Needs a reachable MySQL server with the credentials set on MySqlOP.
    sql=MySqlOP('TQCAI')
    try:
        # Argument order follows insert_data(url, name, lauguage, description, fork).
        sql.insert_data('https://github.com/TQCAI/Computer-Networking-A-Top-Down-Approach-NOTES',
                        'Computer-Networking-A-Top-Down-Approach-NOTES',
                        'HTML' ,
                        '《计算机网络-自顶向下方法(原书第6版)》编程作业,Wireshark实验文档的翻译和解答。',
                        'https://github.com/moranzcw/Computer-Networking-A-Top-Down-Approach-NOTES')
    finally:
        # Release the connection even when the insert fails (e.g. duplicate key).
        sql.close_db()

scrap_github.py


from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

from repo_mysql import MySqlOP

# Absolute XPath prefix shared by every locator below (repository list items
# and the pagination links).  It is tied to the page layout at the time of
# writing and must be updated if the markup changes.
REPO_LIST_XPATH = '/html/body/div[4]/main/div/div[3]/div[3]/div[2]'


def find_by_xpath(browser, xpath):
    """Return the element at ``xpath``; raises NoSuchElementException if absent."""
    return browser.find_element(By.XPATH, xpath)


def read_optional(browser, xpath, read):
    """Return ``read(element)`` for the element at ``xpath``, or '' if it is missing."""
    try:
        element = find_by_xpath(browser, xpath)
    except NoSuchElementException:
        return ''
    return read(element)


def scrape_current_page(browser, sql):
    """Store every repository listed on the currently loaded page.

    List items are probed by 1-based index until one is not found, which is
    treated as the end of the page.
    """
    item_index = 1
    while True:
        item_xpath = f'{REPO_LIST_XPATH}/ul/li[{item_index}]/div[1]'
        # Repository name and link; a missing item ends the page.
        try:
            repo = find_by_xpath(browser, f'{item_xpath}/div[1]/h3/a')
        except NoSuchElementException:
            break
        repoName = repo.text
        repoHref = repo.get_attribute('href')
        # Language, description and fork source are optional per repository.
        language = read_optional(browser, f'{item_xpath}/div[3]/span[2]',
                                 lambda element: element.text)
        description = read_optional(browser, f'{item_xpath}/div[2]/p',
                                    lambda element: element.text)
        fork = read_optional(browser, f'{item_xpath}/div[1]/span/a',
                             lambda element: element.get_attribute('href'))
        item_index += 1
        # Persist the row.
        sql.insert_data(repoHref, repoName, language, description, fork)


profile=webdriver.FirefoxOptions()
profile.add_argument('-headless') # run Firefox without a visible window
browser=webdriver.Firefox(options=profile)
try:
    user='TQCAI'
    url=f'https://github.com/{user}?tab=repositories'

    sql=MySqlOP(user)
    try:
        # Start from an empty table so a re-run does not hit duplicate keys.
        sql.deleteAll()
        isPageTurning=False # becomes True once the first page has been handled
        # Outer loop: walk through all result pages of the user.
        while True:
            browser.get(url)
            scrape_current_page(browser, sql)
            # Pick the pagination link to follow: the 1st link on the first
            # page, the 2nd on later pages (presumably because later pages
            # put a "Previous" link first - confirm against the live markup).
            btnID = 2 if isPageTurning else 1
            isPageTurning = True
            try:
                button = find_by_xpath(browser, f'{REPO_LIST_XPATH}/div/div/a[{btnID}]')
            except NoSuchElementException:
                print('结束了')
                break
            url = button.get_attribute('href')
            print(url)
            print(button.text)
    finally:
        sql.close_db()
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even when scraping fails part-way.
    browser.quit()

树形归类

成果

猜你喜欢

转载自blog.csdn.net/TQCAI666/article/details/89311144
今日推荐