Python下载NASA数据账号分享

账号名:luowenqiang
密码: lWq159357

使用该账号可以免费下载 PO.DAAC 上的数据文件。(注意:在公开文章中分享账号密码存在安全风险,建议读者自行注册 NASA Earthdata 账号。)
代码:

import requests
import os
import time
from bs4 import BeautifulSoup


SPICE_URL = 'https://podaac-tools.jpl.nasa.gov'  # base URL prepended to relative hrefs from listing pages
URL = 'https://podaac-tools.jpl.nasa.gov/drive/files/allData/topex/L2'
# Raw Cookie header copied from a logged-in browser session; required by PO.DAAC Drive.
COOKIE = '_ga=GA1.2.2044435566.1575709094; _gid=GA1.2.984665774.1575709094; PODAAC_Drive=Xetpr3Zh1Qbc-8ALgXh2FQAAABI'
# Parse the raw Cookie header into a dict usable by requests.
# Fix: the header separates pairs with "; ", so the old split(";") left a
# leading space on every cookie name after the first; strip each name.
# Also split on the first '=' only, so values containing '=' stay intact.
g_cookies = {
    name.strip(): value
    for name, value in (pair.split('=', 1) for pair in COOKIE.split(';'))
}
base_path = "D:\\xzydown\\"  # local download root (Windows path)

def mkdir(path):
    """Create directory *path* (including any missing parents).

    Leading/trailing whitespace and a trailing backslash are stripped
    from *path* before creation.

    :param path: directory path to create
    :return: True if the directory was created, False if it already existed
    """
    # Normalize: drop surrounding whitespace and a trailing '\' separator.
    path = path.strip().rstrip("\\")
    # EAFP: create and let the filesystem report "already exists".  This
    # avoids the check-then-create race of os.path.exists + os.makedirs,
    # and drops the redundant function-local `import os` (already imported
    # at module level).
    try:
        os.makedirs(path)
        print('创建成功')
        return True
    except FileExistsError:
        print('目录已存在')
        return False

def DownloadFile(url, path, m_cookie=''):
    """Stream *url* to local file *path*, printing progress ~every 2 seconds.

    :param url: download link
    :param path: local file path to save to
    :param m_cookie: optional cookie dict for this request; falls back to the
                     module-level g_cookies when empty.  (The original
                     accepted this parameter but never used it — fixed.)
    :return: None
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # Fix: honor the m_cookie parameter instead of silently ignoring it.
    cookies = m_cookie if m_cookie else g_cookies
    # stream=True fetches the body chunk by chunk instead of all in memory;
    # `with` ensures both the response and the file are closed on error.
    with requests.get(url, stream=True, headers=headers, cookies=cookies) as r:
        # Fix: the server may omit Content-Length; default to 0 rather than
        # raising KeyError (progress percentage is then reported as 0.00).
        length = float(r.headers.get('content-length', 0))
        count = 0
        count_tmp = 0
        time1 = time.time()
        with open(path, 'wb') as f:
            # 8 KiB chunks instead of 512 B: fewer iterations/syscalls.
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        # Guard against division by zero when length unknown.
                        p = count / length * 100 if length else 0.0
                        speed = (count - count_tmp) / 1024 / 1024 / 2
                        count_tmp = count
                        print(path + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
    print("文件保存在:"+path)

def formatFloat(num):
    """Render *num* as a string with exactly two decimal places."""
    return '%.2f' % num

def getTestUrl(test_url):
    """Recursively walk a PO.DAAC Drive listing page and download its files.

    Each page is an HTML directory listing; links inside the
    'table-responsive' table are either sub-directories (recursed into) or
    files (downloaded to base_path/<parent-dir-name>/).

    :param test_url: listing (or file) URL to process
    :return: None
    """
    res = requests.get(test_url, cookies=g_cookies)
    bs_html = BeautifulSoup(res.text, 'lxml')
    tmd = bs_html.find_all(class_='table-responsive')
    if not tmd:
        # No listing table on the page: treat the URL itself as a file.
        sp_list = test_url.split('/')
        sp_name = sp_list[-1]                       # file name = last path component
        sp_path = base_path + sp_list[-2] + "\\"    # folder named after parent dir
        mkdir(sp_path)
        DownloadFile(test_url, sp_path + sp_name)
        return
    for link in tmd[0].find_all('a'):
        # Skip the listing's "up one level" entry.
        if link.text == 'Parent Directory':
            continue
        cur_url = SPICE_URL + link.get('href').strip()
        # Heuristic: a '.' appearing after index 30 (past the dots in the
        # host name) marks a file; otherwise assume a directory and recurse.
        # NOTE(review): fragile — misclassifies extension-less files and
        # dotted directory names; confirm against the actual listing layout.
        if cur_url.find('.', 30) < 0:
            getTestUrl(cur_url)
        else:
            cur_url_name_list = cur_url.split('/')
            cur_url_name = cur_url_name_list[-1]
            down_path = base_path + cur_url_name_list[-2] + "\\"
            mkdir(down_path)
            DownloadFile(cur_url, down_path + cur_url_name)
            print("需要下载的链接:" + cur_url)
    #下载方法
# Script entry point: crawl the configured listing URL recursively,
# downloading everything beneath it, then print a completion banner.
if __name__ == '__main__':
    getTestUrl(URL)
    print('============================运行结束标志===========================')
发布了365 篇原创文章 · 获赞 80 · 访问量 35万+

猜你喜欢

转载自blog.csdn.net/Giser_D/article/details/103462475