SensitiveInformationDetection敏感信息检测工具（暴露面检测）

前言

前段时间在工作中碰到了一些需要大量人工检测的工作——暴露面检测，其最后一步要人工检测该页面是否存在敏感信息泄露问题，需要人工判断，遂写了一个自动检测网页敏感信息的脚本。

介绍

该脚本主要以中间件版本、其他版本、是否有源码泄露、敏感信息检测和是否存在下载行为这几方面来检测。

由于禁用了SSL证书验证，所以会存在一些安全性的问题，请酌情使用。

原理为爬取网页内容，使用正则表达式匹配关键词，如果有特殊关键字可以自行修改、添加正则表达式来完善成自己所想使用的代码。

中间件版本

主要从以下版本中检测（基本涵盖了市面上常见的中间件）

Tomcat
WebLogic
Jboss
Jetty
WebSphere
Glassfish
Nginx
Apache
Microsoft IIS
Kafka
RabbitMQ
Redis
Elasticsearch
MongoDB
MySQL
Node.js
Express.js
Django

其他版本

主要是对【数字.数字】【数字.数字.数字】这两种格式的版本进行检测

是否有源码泄露

因为网页大部分是由HTML书写，所以在匹配时先匹配HTML并将其剔除，再匹配其他语言（包括下面列出的这些），主要是根据该语言中常见的语法关键字来匹配。

HTML

Python
JavaScript
Java
C++
Go

敏感信息检测

主要检测以.com 和.cn结尾的邮箱；以13、14、15、18和17开头的11位中国大陆手机号；以及中国大陆身份证号

是否存在下载行为

对目标URL发送HEAD请求并检查响应头，若Content-Type以httpd/unix-directory或者application开头，则可能存在下载行为；如果遇到特殊的情况也可以自行添加。

使用

在脚本中我引入了argparse模块，-h查看使用方法

在这里插入图片描述

目前只支持单条URL检测和文件批量检测两种方式，也可以将扫描结果输出为文件；批量检测时使用多线程以更快地处理多条数据（默认线程数为5）。

源代码

import argparse
import re
import requests
import threading
from tabulate import tabulate
import urllib3
from tqdm import tqdm

# Suppress urllib3's InsecureRequestWarning (requests below are sent with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# 编译正则表达式
# Banner patterns keyed by middleware name. Most entries capture the version
# number that follows "<Product>/"; the last three have no capture group and
# only detect the presence of an "X-Powered-By" line.
REGEX_DICT = {
    # Java application servers
    'Tomcat': r'Apache\s*Tomcat/([\d\.]+)',
    'Weblogic': r'Oracle\s*WebLogic\s*Server/([\d\.]+)',
    'Jboss': r'JBoss/([\d\.]+)',
    'Jetty': r'Jetty/([\d\.]+)',
    # NOTE(review): key is spelled 'Webshere' (sic); it is emitted in the
    # report, so it is left untouched here.
    'Webshere': r'IBM\s*WebSphere/([\d\.]+)',
    'Glassfish': r'GlassFish/([\d\.]+)',
    # Web servers
    'Nginx': r'nginx/([\d\.]+)',
    'Apache': r'Apache/([\d\.]+)',
    'Microsoft IIS': r'Microsoft-IIS/([\d\.]+)',
    # Message queues and data stores
    'Kafka': r'Apache\s*Kafka/([\d\.]+)',
    'RabbitMQ': r'RabbitMQ/([\d\.]+)',
    'Redis': r'Redis/([\d\.]+)',
    'Elasticsearch': r'Elasticsearch/([\d\.]+)',
    'MongoDB': r'MongoDB/([\d\.]+)',
    'MySQL': r'MySQL/([\d\.]+)',
    # Frameworks (presence only, no version captured)
    # NOTE(review): 'Node.js' and 'Express.js' use the same pattern — confirm
    # this duplication is intended.
    'Node.js': r'X-Powered-By: Express',
    'Express.js': r'X-Powered-By: Express',
    'Django': r'X-Powered-By: Django',
}

# Compile every pattern once, case-insensitively, so scans can reuse them.
COMPILED_REGEX_DICT = {
    name: re.compile(pattern, re.IGNORECASE)
    for name, pattern in REGEX_DICT.items()
}

# Patterns for personal data that should not appear in a public page.
# Each pattern has exactly one capture group holding the whole hit, and the
# numeric patterns are fenced with digit lookarounds so they do not fire on a
# slice of a longer digit run (timestamps, order numbers, ...).
SENSITIVE_INFO_REGEX_LIST = [
    # E-mail addresses whose domain ends in .cn or .com; the trailing \b stops
    # "user@host.community" from being reported as "user@host.com".
    r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.(?:cn|com)\b)',
    # 11-digit mainland-China mobile numbers starting with 13/14/15/17/18.
    # The prefix is matched exactly once (the former `(...)+` let it repeat).
    r'(?<!\d)((?:13|14|15|17|18)[0-9]{9})(?!\d)',
    # Mainland-China ID numbers: 18 characters (17 digits plus a digit or
    # X/x check character) or the legacy 15-digit form. The former class
    # `[\d|x]` also accepted a literal '|' and rejected an uppercase 'X'.
    r'(?<!\d)(\d{17}[\dXx]|\d{15})(?!\d)',
]

# Keyword heuristics used to guess which language a response body is written
# in. Order matters: 'HTML' is listed first so ordinary web pages are
# recognised before the looser language heuristics are tried.
# All patterns are raw strings; the previous plain strings relied on the
# invalid escape sequence "\s", which newer Python versions warn about.
PROGRAMMING_LANGUAGES = {
    # Case-insensitive and tolerant of attributes, so that
    # "<!doctype html>" and '<html lang="en">' are recognised as HTML too.
    'HTML': r'(?i:<html[\s>]|<!DOCTYPE)',
    'Python': r'import\s+|def\s+|print\s*\(|from\s+',
    'JavaScript': r'function\s+|console\.',
    'Java': r'public\s+class\s+|import\s+java\.',
    'C++': r'#include\s+<|using\s+namespace\s+std',
    # NOTE(review): very loose — matches any "go" followed by whitespace.
    'Go': r'go\s+',
}

# 爬取网页内容
def read_content(url):
    """Fetch ``url`` with a GET request and return the response body as text.

    The request is sent with TLS certificate verification disabled and a
    3-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or the literal string ``"NONE"`` when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        # Silence the warning urllib3 emits for unverified HTTPS requests.
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, verify=False, timeout=3)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        # The replacement field is kept on one line: splitting it across
        # lines inside a single-quoted f-string is a SyntaxError before
        # Python 3.12.
        print(f"请求异常: {e}")
        return "NONE"

# 查找中间件和其他版本
def find_versions(content, compiled_regex_dict):
    """Look for a middleware banner in ``content``, else for bare versions.

    The compiled patterns are tried in dictionary order and the search stops
    at the first one that matches. Only when no middleware pattern matches
    is the text scanned for generic ``N.N`` / ``N.N.N`` version numbers.

    Args:
        content: Text to scan.
        compiled_regex_dict: Mapping of middleware name to compiled pattern.

    Returns:
        A ``(found_middleware, found_other)`` tuple of lists.
        ``found_middleware`` holds at most one string: the middleware name
        immediately followed by the first ``findall`` result (the captured
        version, or the whole match for patterns without a group).
        ``found_other`` holds every generic version number found and is only
        filled when ``found_middleware`` is empty.
    """
    found_middleware = []
    found_other = []

    for middleware, compiled_regex in compiled_regex_dict.items():
        matches = compiled_regex.findall(content)
        if matches:
            # Kept on a single line: a replacement field split across lines
            # is a SyntaxError in single-quoted f-strings before Python 3.12.
            found_middleware.append(f"{middleware}{matches[0]}")
            break
    else:
        # Reached only when the loop finished without `break`, i.e. no
        # middleware matched. Every hit already has the required shape, so
        # no second validation pass is needed.
        found_other.extend(re.findall(r'\d+(?:\.\d+){1,2}', content))

    return found_middleware, found_other

# 版本检测
def version_detection(content, compiled_regex_dict):
    """Return the first middleware banner and first generic version found.

    Args:
        content: Text to scan; a falsy value skips the scan entirely.
        compiled_regex_dict: Mapping of middleware name to compiled pattern,
            passed through to ``find_versions``.

    Returns:
        A ``(middleware, other)`` tuple of strings, each being the first
        entry of the corresponding ``find_versions`` list or ``"NONE"`` when
        that list is empty.
    """
    if not content:
        # Nothing to scan: both columns are reported as missing.
        return "NONE", "NONE"

    middleware_hits, other_hits = find_versions(content, compiled_regex_dict)
    first_middleware = middleware_hits[0] if middleware_hits else "NONE"
    first_other = other_hits[0] if other_hits else "NONE"
    return first_middleware, first_other

# 匹配网页内容中的编程语言
def match_programming_language(content):
    """Guess which programming language ``content`` is written in.

    The patterns in ``PROGRAMMING_LANGUAGES`` are tried in dictionary order
    and the first one that matches decides the result.

    Args:
        content: Text to inspect.

    Returns:
        The name of the first matching language, or ``"NONE"`` when nothing
        matches or when the first match is ``"HTML"``.
    """
    for language, pattern in PROGRAMMING_LANGUAGES.items():
        if not re.search(pattern, content):
            continue
        # A page identified as HTML is not treated as leaked source code.
        return "NONE" if language == "HTML" else language

    return "NONE"

# 检测敏感信息
def check_sensitive_info(content):
    """Report whether ``content`` matches any sensitive-information pattern.

    Args:
        content: Text to inspect.

    Returns:
        ``"Possible"`` when at least one pattern in
        ``SENSITIVE_INFO_REGEX_LIST`` matches somewhere in ``content``,
        otherwise ``"NONE"``.
    """
    # Only the presence of a hit matters, so the matches are not collected.
    has_hit = any(
        re.search(pattern, content) is not None
        for pattern in SENSITIVE_INFO_REGEX_LIST
    )
    return "Possible" if has_hit else "NONE"

#判断是否为下载链接 
def is_downloadable(url):
    """Guess from the Content-Type header whether ``url`` serves a download.

    A HEAD request is sent (following redirects, TLS verification disabled)
    and the ``Content-Type`` response header is inspected.

    Args:
        url: Address to probe.

    Returns:
        ``"Possible"`` when the Content-Type starts with ``application`` or
        ``httpd/unix-directory``; ``"NONE"`` otherwise, including when the
        header is absent or the request fails.
    """
    try:
        # Without a timeout an unresponsive host would block this call (and
        # the worker thread running it) indefinitely.
        r = requests.head(url, allow_redirects=True, verify=False, timeout=3)
    except requests.exceptions.RequestException:
        return "NONE"

    # Media types are case-insensitive, so normalise before comparing.
    content_type = (r.headers.get('content-type') or '').strip().lower()
    if content_type.startswith(('application', 'httpd/unix-directory')):
        return "Possible"
    return "NONE"

# 输出结果
def output_results(output, output_file=None):
    """Print the scan results as a table and optionally save them to a file.

    Args:
        output: Rows to render, one list of six column values per URL.
        output_file: Path of a file to write the table to. When falsy the
            table is only printed.
    """
    column_titles = [
        "URL",
        "Middleware version",
        "Other version",
        "Source code leakage",
        "information leakage",
        "Download files?",
    ]
    rendered = tabulate(output, column_titles, tablefmt='simple')

    if not output_file:
        print(rendered)
        return

    # The file is opened before printing, so a bad path fails before any
    # output is shown.
    with open(output_file, 'w', newline='') as file:
        print(rendered)
        file.write(rendered)

# 线程函数
def worker(url, COMPILED_REGEX_DICT):
    """Scan one URL and append its result row to the global ``output`` list.

    Intended as a thread target. The module-level ``semaphore`` is held for
    the duration of the scan and released even if a check raises.

    Args:
        url: Address to scan.
        COMPILED_REGEX_DICT: Mapping of middleware name to compiled pattern.
    """
    # Blocks while the maximum number of scans is already in progress.
    with semaphore:
        page = read_content(url)
        middleware_version, other_version = version_detection(page, COMPILED_REGEX_DICT)
        row = [
            url,
            middleware_version,
            other_version,
            match_programming_language(page),
            check_sensitive_info(page),
            is_downloadable(url),
        ]
        output.append(row)


if __name__ == '__main__':
    # Command-line entry point: scan a single URL (-u) or every URL listed
    # in a file (-f), then print the result table and optionally save it (-o).
    parser = argparse.ArgumentParser(description='敏感信息检测')
    parser.add_argument('-u', '--url', type=str, help='URL')
    parser.add_argument('-f', '--file', type=str, help='输入文件')
    parser.add_argument('-o', '--output', type=str, help='输出文件')
    parser.add_argument('-t', '--thread', type=int, help='线程数量(默认为5)')
    args = parser.parse_args()

    # A negative count would otherwise surface as a raw ValueError from
    # BoundedSemaphore; report it as a usage error instead.
    if args.thread is not None and args.thread < 0:
        parser.error('线程数量必须为正整数')

    # Result rows; worker threads append to this module-level list.
    output = []

    if args.url:
        content = read_content(args.url)
        middleware, other = version_detection(content, COMPILED_REGEX_DICT)
        language = match_programming_language(content)
        sensitive = check_sensitive_info(content)
        downloadable = is_downloadable(args.url)
        output.append([args.url, middleware, other, language, sensitive, downloadable])
        output_results(output, args.output)
    elif args.file:
        # Semaphore read by worker() to cap the number of concurrent scans;
        # an omitted or zero -t falls back to 5.
        thread = args.thread if args.thread else 5
        semaphore = threading.BoundedSemaphore(thread)
        with open(args.file, 'r') as f:
            # Skip blank lines so they are not scanned as (invalid) URLs.
            urls = [line.strip() for line in f if line.strip()]
        with tqdm(total=len(urls)) as pbar:
            threads = []
            for url in urls:
                t = threading.Thread(target=worker, args=(url, COMPILED_REGEX_DICT))
                t.start()
                threads.append(t)
            # Advance the bar as scans finish rather than as threads start,
            # so it reflects completed work.
            for t in threads:
                t.join()
                pbar.update(1)
        output_results(output, args.output)
    else:
        # Neither -u nor -f was given: show usage instead of exiting silently.
        parser.print_help()

测试

在这里插入图片描述

测试案例中（192.168.164.134:8080是我自己搭建的web服务，使用的是Tomcat8.5.19）

第一个是不知名版本泄露

第二和第三个都是Tomcat版本泄露

第四个不知名版本泄露和JavaScript源码泄露

第五个是可能存在信息泄露，信息如下所示，包含了邮箱

123456

[email protected]

第六、七、八个可能存在下载行为：其响应包的Content-Type中存在httpd/unix-directory或者application

免责声明

本工具仅能在取得足够合法授权的企业安全建设中使用，在使用本工具过程中，您应确保自己所有行为符合当地的法律法规。如您在使用本工具的过程中存在任何非法行为，您将自行承担所有后果，本工具所有开发者和所有贡献者不承担任何法律及连带责任。除非您已充分阅读、完全理解并接受本协议所有条款，否则，请您不要使用本工具。您的使用行为或者您以其他任何明示或者默示方式表示接受本协议的，即视为您已阅读并同意本协议的约束。