遍历文件夹获取git仓库远程地址列表

github 等平台有意思的仓库太多了,本地管理是个麻烦事,使用自己最笨拙的方法手动管理好心累。

不同网站不同作者的仓库有可能撞名,按照网址格式存储一层层往下找太麻烦了,索性写个脚本遍历。

Python 自带的 os.walk 方法不熟悉,造成无关的文件和文件夹也会挨个遍历,浪费太多时间,所以手动写个配置文件搜索函数。
yield 方法返回很好,但还没研究,等熟悉了再换掉 return 方法。

直接上代码:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import sys
import os
import configparser  # ini解析

class RepoLists_old(object):
    """Scan a directory tree with ``os.walk`` and record repository remote URLs.

    Every ``.git/config`` file found below ``scan_path`` is parsed as an INI
    file and the ``url`` of its first ``remote`` section is appended to
    ``config_scan.txt`` in the current working directory.
    """

    def __init__(self, scan_path: str) -> None:
        """Start a fresh report file and remember the directory to scan.

        Args:
            scan_path: Root directory searched for Git repositories.
        """
        self.write_file = 'config_scan.txt'
        # Truncate any previous report and note which path is being scanned.
        with open(self.write_file, 'wt') as report:
            report.write('# scan path : {}\n'.format(scan_path))
        self.scan_path = scan_path
        self.config_paths = list()

    def _log(self, text: str) -> None:
        """Append ``text`` to the report file and close the handle at once."""
        with open(self.write_file, 'at') as report:
            report.write(text)

    def find_config_files(self):
        """Yield the path of every ``.git/config`` file below ``self.scan_path``.

        Yields:
            Paths built as ``<directory>/.git/config`` for each directory that
            contains a ``.git`` folder holding a ``config`` file.
        """
        for root, dirs, _files in os.walk(self.scan_path):
            if '.git' not in dirs:
                continue
            # Prune the metadata folder in place so os.walk does not spend
            # time on Git's object store; the working tree is still walked.
            dirs.remove('.git')
            config_path = os.path.join(root, '.git', 'config')
            if os.path.isfile(config_path):
                yield config_path

    def config_parse(self, ini_path: str) -> str:
        """Return the URL of the first remote declared in a Git config file.

        Args:
            ini_path: Path of a ``.git/config`` file.

        Returns:
            The ``url`` value of the first section whose name starts with
            ``remote``; an empty string when the file is missing, declares no
            remote URL, or cannot be parsed (the failure is then logged to the
            report file).
        """
        # Git configs are not strict INI: keys may repeat (several ``fetch``
        # lines), may have no value (``bare``), and URLs may contain ``%``.
        config = configparser.ConfigParser(
            interpolation=None, strict=False, allow_no_value=True)
        try:
            # Decode as UTF-8 explicitly; the locale default (e.g. gbk on
            # Chinese Windows) fails on non-ASCII bytes.
            config.read(ini_path, encoding='utf-8')
        except (configparser.Error, UnicodeDecodeError):
            self._log('# read config file ERROR : {}\n'.format(ini_path))
            return ''
        for section in config.sections():
            # Match the section name prefix so that e.g.
            # [submodule "remote-x"] is not mistaken for a remote.
            if not section.lower().startswith('remote'):
                continue
            url = config[section].get('url')
            if url:
                return url
        return ''

    def show_infos(self):
        """Append one ``file:<config> url:<remote>`` line per repository found."""
        for config_file in self.find_config_files():
            url = self.config_parse(config_file)
            self._log('file:{} url:{}\n'.format(config_file, url))


class RepoLists(object):
    """Find Git repositories below a directory and record their remote URLs.

    The search stops descending as soon as a directory contains
    ``.git/config``, so the contents of a repository are never traversed.
    Results are written to ``config_scan.txt`` in the current working
    directory.
    """

    def __init__(self, scan_path: str) -> None:
        """Start a fresh report file and remember the directory to scan.

        Args:
            scan_path: Root directory searched for Git repositories.
        """
        self.write_file = 'config_scan.txt'
        # Truncate any previous report and note which path is being scanned.
        with open(self.write_file, 'wt') as report:
            report.write('# scan path : {}\n'.format(scan_path))
        self.scan_path = scan_path
        self.config_paths = list()

    def _log(self, text: str) -> None:
        """Append ``text`` to the report file and close the handle at once."""
        with open(self.write_file, 'at') as report:
            report.write(text)

    def find_config_files(self, scan_path):
        """Return the ``.git/config`` paths of all repositories under ``scan_path``.

        The working directory is never changed: every check uses a path joined
        onto ``scan_path``, so relative scan paths work and the report file
        keeps being written where it was created.

        Args:
            scan_path: Directory to inspect; may be a repository itself.

        Returns:
            A list of config file paths, visiting sub-directories in sorted
            order. A directory that is itself a repository yields exactly one
            entry and is not searched any further.
        """
        # Probe the config file directly: if it exists this is a repository.
        config_file = os.path.join(scan_path, '.git', 'config')
        if os.path.isfile(config_file):
            return [config_file]
        try:
            names = os.listdir(scan_path)
        except OSError:
            # Unreadable or vanished directory: note it and keep scanning
            # the rest of the tree instead of aborting the whole run.
            self._log('# scan dir ERROR : {}\n'.format(scan_path))
            return []
        # Files are irrelevant here; only sub-directories are searched.
        subdirs = sorted(
            name for name in names
            if os.path.isdir(os.path.join(scan_path, name)))
        config_files = []
        for name in subdirs:
            config_files.extend(
                self.find_config_files(os.path.join(scan_path, name)))
        return config_files

    def config_parse(self, ini_path: str) -> str:
        """Return the URL of the first remote declared in a Git config file.

        Args:
            ini_path: Path of a ``.git/config`` file.

        Returns:
            The ``url`` value of the first section whose name starts with
            ``remote``; an empty string when the file is missing, declares no
            remote URL, or cannot be parsed (the failure is then logged to the
            report file).
        """
        # Git configs are not strict INI: keys may repeat (several ``fetch``
        # lines), may have no value (``bare``), and URLs may contain ``%``.
        config = configparser.ConfigParser(
            interpolation=None, strict=False, allow_no_value=True)
        try:
            # Decode as UTF-8 explicitly rather than with the locale codec.
            config.read(ini_path, encoding='utf-8')
        except (configparser.Error, UnicodeDecodeError):
            self._log('# read config file ERROR : {}\n'.format(ini_path))
            return ''
        for section in config.sections():
            # Match the section name prefix so that e.g.
            # [submodule "remote-x"] is not mistaken for a remote.
            if not section.lower().startswith('remote'):
                continue
            url = config[section].get('url')
            if url:
                return url
        return ''

    def show_infos(self):
        """Write one ``file:<config> url:<remote>`` line per repository, then a
        closing ``write finished!`` marker."""
        for config_file in self.find_config_files(self.scan_path):
            url = self.config_parse(config_file)
            self._log('file:{} url:{}\n'.format(config_file, url))
        self._log('write finished!')


if __name__ == '__main__':
    # sys.argv[0] is always the script name, so a scan path is only present
    # when there are at least two entries.
    if len(sys.argv) < 2:
        print("请输入扫描路径")
        # Stop here; continuing would raise IndexError on sys.argv[1].
        sys.exit(1)
    RepoLists(sys.argv[1]).show_infos()

“os.walk()” 太耗时了,改掉之后虽然是遍历完再返回,但是返回时间可以等待,原先那种方法一个目录就够等三五分钟的了,实在受不了。扫描结果保存到了"config_scan.txt"文件,方便进行下一步处理。

新的扫描方法是查找".git"目录,在该目录中查找"config"文件,使用“ini”进行解析,查找“[remote xxxx]”字段下的“url”并保存。有些项目的“config”文件加载时会出错,需要注意手动单独处理。

猜你喜欢

转载自blog.csdn.net/u012101384/article/details/131369619