5/25

import requests
from lxml import etree
import re
import json
from pyquery import PyQuery
import time

# Scraper for tender notices listed on hbbidcloud.cn (Suizhou section).
#
# The listing pages are fetched one by one; for every notice the detail page
# is downloaded, its text is stripped of whitespace and a handful of fields
# are pulled out with regular expressions and printed.
#
# All parsing helpers below are pure functions of the notice text and return
# '' when a field cannot be found, so one unusual notice does not abort the
# whole crawl.
#
# NOTE(review): several original patterns contained duplicated alternatives
# (e.g. ':|:' and '[::]'), which looks like full-width punctuation that was
# flattened to ASCII at some point.  The patterns here accept both the ASCII
# and the full-width form; confirm against live pages.

BASE_URL = 'http://www.hbbidcloud.cn'
LIST_URL_TEMPLATE = BASE_URL + '/suizhou/jyxx/004002/{}.html'
LAST_PAGE = 5
REQUEST_TIMEOUT = 30  # seconds; without it a stalled server hangs the crawl

LIST_TITLE_SELECTOR = '#main > div > div.ewb-info-bd > ul > li > div > a'
LIST_DATE_SELECTOR = '#main > div > div.ewb-info-bd > ul > li > span'
DETAIL_SELECTOR = ('body > div.ewb-container > div.ewb-article > div > '
                   'div.ewb-article-info')

_DATE = r'(20\d\d\D{1,5}\d{1,2}\D{1,5}\d{1,2})'
# Tried in order; the first pattern that matches wins.
_DEADLINE_PATTERNS = (
    re.compile(r'截止时间.*?' + _DATE),
    re.compile(r'应于' + _DATE + r'日.{0,10}前'),
    re.compile(r'至' + _DATE),
)

# The '万元' branch historically did not split on semicolons; keep that.
_SCALE_SPLIT_WAN = re.compile('[::,,]|投资|概算|约为|约')
_SCALE_SPLIT_OTHER = re.compile('[::,,;;]|投资|概算|约为|约')

# (trigger keywords, pattern whose match ends right before the section text).
# Rules are tried in order, mirroring the original if/elif chain.
_QUALIFICATION_RULES = (
    (('人资质类别和等级',),
     re.compile('人资质类别和等级[::]')),
    (('投标人资质条件', '投标人资格要求', '投标人资格条件', '投标人资质要求'),
     re.compile('投标人资[质格][要条][求件]')),
    (('投标企业资质条件', '投标企业资格要求', '投标企业资格条件', '投标企业资质要求'),
     re.compile('投标企业资[质格][要条][求件][:|:]')),
    (('投标人须具',),
     re.compile('投标人须具[有备]')),
    (('投标人条件:', '投标人要求:', '投标人条件:', '投标人要求:'),
     re.compile('投标人[条要][求件][::]')),
    (('投标人的资格要求:', '投标人的资格要求:'),
     re.compile('投标人的资格要求[::]')),
    (('申请人资格要求',),
     re.compile('申请人资格要求')),
)

_CLAUSE_DELIM = '[::,,。.]'
_GRADE_PATTERNS_NUMERIC = (
    re.compile(_CLAUSE_DELIM + '(.*?[一二三]级)(.*?)' + _CLAUSE_DELIM),
    re.compile('(.*?[一二三]级)(.*?)' + _CLAUSE_DELIM),
)
_GRADE_PATTERNS_FORMAL = (
    re.compile(_CLAUSE_DELIM + '(.*?[壹贰弎叁甲乙丙]级)(.*?)' + _CLAUSE_DELIM),
)

_LEADER_SPLIT = re.compile('总监理工程师|项目经理|项目负责人|项目总监|负责人')
_LEADER_STRIP = str.maketrans('', '', '::或,,、')


def first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


def normalize_text(raw):
    """Remove every whitespace character (spaces, NBSP, newlines) from *raw*."""
    return re.sub(r'\s+', '', raw)


def list_page_url(page):
    """Return the listing URL for *page*; page 1 is served as 'about.html'."""
    return LIST_URL_TEMPLATE.format('about' if page == 1 else page)


def parse_register_num(tender_name):
    """Return the 'HBSU...' code embedded in a notice title, or ''."""
    if not tender_name or 'HBSU' not in tender_name:
        return ''
    tail = tender_name.split('HBSU')[1]
    # The code runs up to the closing parenthesis (ASCII or full-width).
    return 'HBSU' + re.split('[))]', tail)[0]


def parse_tenderee(text):
    """Return the text between '招标人:' and '代理机构', or ''."""
    return first_group('招标人[::](.*?)代理机构', text)


def parse_tel(text):
    """Return the text between '电话:' and the next '电话', or ''.

    The pattern needs two occurrences of '电话'; a notice with only one
    yields '' (the original code raised IndexError in that case).
    """
    return first_group('电话[::](.*?)电话', text)


def parse_agency(text):
    """Return the text between '代理机构:' and '地址', or ''."""
    return first_group('代理机构[::](.*?)地址', text)


def parse_deadline(text):
    """Return the first deadline-looking date in *text* with '日' appended, or ''."""
    for pattern in _DEADLINE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1) + '日'
    return ''


def parse_tender_scale(text):
    """Return the amount preceding the first '万元', '元' or '亿' in *text*.

    Units are checked in that order.  For '万元' and '元' the amount is kept
    only when it is 5 to 11 characters long; otherwise '' is returned.
    """
    if '万元' in text:
        amount = _SCALE_SPLIT_WAN.split(text.split('万元', 1)[0])[-1]
        if 4 < len(amount) < 12:
            return amount.replace('额为', '').replace('额', '') + '万元'
        return ''
    if '元' in text:
        amount = _SCALE_SPLIT_OTHER.split(text.split('元', 1)[0])[-1]
        return amount + '元' if 4 < len(amount) < 12 else ''
    if '亿' in text:
        amount = _SCALE_SPLIT_OTHER.split(text.split('亿', 1)[0])[-1]
        return amount.replace('额', '') + '亿'
    return ''


def extract_qualification_section(text):
    """Return the text following the first recognised qualification heading.

    Returns '' when no heading is found.  When a trigger keyword is present
    but its pattern does not match (previously an IndexError), the next rule
    is tried instead.
    """
    for triggers, pattern in _QUALIFICATION_RULES:
        if any(keyword in text for keyword in triggers):
            match = pattern.search(text)
            if match:
                return text[match.end():]
    return ''


def parse_bidder_qualification(text):
    """Return the bidder-qualification clause extracted from *text*, or ''.

    A clause mentioning a grade (一级/二级/三级, or 壹/贰/叁/甲/乙/丙 级) is
    preferred; if none is found in the qualification section, the section's
    first clause is used.
    """
    section = extract_qualification_section(text)

    # Which grade vocabulary to look for is decided on the whole text,
    # but the clause itself is searched for inside the section only.
    if any(grade in text for grade in ('一级', '二级', '三级')):
        patterns = _GRADE_PATTERNS_NUMERIC
    elif any(mark in text for mark in '壹贰弎甲乙丙叁'):
        patterns = _GRADE_PATTERNS_FORMAL
    else:
        patterns = ()

    for pattern in patterns:
        match = pattern.search(section)
        if match:
            clause = match.group(1) + match.group(2)
            break
    else:
        # No grade clause found (the original raised IndexError here).
        clause = re.split('[。,,]', section)[0]

    if len(clause) > 255:
        clause = re.split('[,,]', clause)[0]
    # NOTE(review): stripping every '3' is inherited from the original code;
    # the intent is unclear (possibly a list-number marker) — confirm.
    return clause.replace('3', '')


def parse_leader_qualification(text):
    """Return the clause following the first project-leader title, or ''.

    Returns '' when no title is present or when the following text contains
    '存在控股' (a conflict-of-interest paragraph, not a qualification).
    """
    parts = _LEADER_SPLIT.split(text, 1)
    if len(parts) < 2:
        return ''
    rest = parts[1]
    if '存在控股' in rest:
        return ''
    return re.split('[;;,,。]', rest)[0].translate(_LEADER_STRIP)


def parse_detail(text):
    """Return a dict of all fields parsed from a whitespace-free notice text."""
    return {
        'tenderee': parse_tenderee(text),
        'tel': parse_tel(text),
        'agency': parse_agency(text),
        'deadline': parse_deadline(text),
        'tender_scale': parse_tender_scale(text),
        'bidder_qualification': parse_bidder_qualification(text),
        'leader_qualification': parse_leader_qualification(text),
    }


def crawl():
    """Fetch listing pages 1..LAST_PAGE and print the parsed fields of each notice."""
    for page in range(1, LAST_PAGE + 1):
        listing = PyQuery(
            requests.get(list_page_url(page), timeout=REQUEST_TIMEOUT).text)
        links = listing(LIST_TITLE_SELECTOR).items()
        dates = listing(LIST_DATE_SELECTOR).items()
        for link, date_cell in zip(links, dates):
            href = link.attr('href')
            if not href:
                continue  # nothing to follow
            tender_name = link.attr('title') or ''
            item = {
                'original_url': BASE_URL + href,
                'tender_name': tender_name,
                'get_date': date_cell.text(),
                'register_num': parse_register_num(tender_name),
            }
            print('!!!!!!', tender_name)

            detail = PyQuery(
                requests.get(item['original_url'],
                             timeout=REQUEST_TIMEOUT).text)
            text1 = normalize_text(detail(DETAIL_SELECTOR).text())
            item.update(parse_detail(text1))

            print(item['tenderee'])
            print(item['tel'])
            print(item['agency'])
            print(item['deadline'])
            print(item['tender_scale'])
            print(item['bidder_qualification'])
            print(item['leader_qualification'])


if __name__ == '__main__':
    crawl()

# 猜你喜欢 ("You may also like" - leftover page navigation text)

# 转载自 (reposted from) www.cnblogs.com/marier/p/12953768.html