思路:在登录页面获取令牌,发送发帖,附带上已获取的令牌
参考:https://www.jianshu.com/p/d73e971da41c
import requests
from lxml import etree
# Log in to a Django-style site: GET the login page to obtain the CSRF
# token, then POST the credentials together with that token.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Referer': '',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}
url = ''  # TODO: fill in the login page URL

# A Session keeps cookies between the GET and the POST; the server's CSRF
# check needs the session cookie set by the first request.
sss = requests.Session()
response = sss.get(url, headers=headers)
page = etree.HTML(response.text)
# Select the token explicitly by name.  The original '//input[1]/@value'
# selects the first <input> of *every* parent element (the positional
# predicate binds to the step), so its first match is not guaranteed to
# be the CSRF token.
tokens = page.xpath('//input[@name="csrfmiddlewaretoken"]/@value')
token = tokens[0]
data = {
    'csrfmiddlewaretoken': token,
    'username': 'zuolibing', 'password': 'zuolibing'}
r = sss.post(url, headers=headers, data=data)
print(r.text)
scrapy版本:
参考:https://www.jianshu.com/p/9d1e00dc40e4
from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest, Request
class LoginSpider(CrawlSpider):
    """Log in by requesting the login page, then submitting its form.

    ``FormRequest.from_response`` carries over the hidden <input> fields
    of the received page (including the CSRF token) into the POST body,
    so only the visible credential fields need to be supplied.
    """
    name = 'login'
    allowed_domains = ['web']

    def start_requests(self):
        # Fetch the login page first; its hidden fields are needed to
        # build the login POST in parse_welcome().
        yield Request(
            'url',
            callback=self.parse_welcome,
        )

    def parse_welcome(self, response):
        # Construct the form submission from the page just received;
        # hidden form parameters are filled in automatically.
        return FormRequest.from_response(
            response,
            formdata={
                'username': 'zuolibing',
                'password': 'zuolibing',
            },
        )
第三版:
import scrapy
# 利用response中返回的隐藏字段,构造表单并发起请求
class LoginSpider(scrapy.Spider):
    """Log in via the hidden-field form, then scrape a value from the
    page reached after the login POST."""
    name = 'login'
    start_urls = ['']  # TODO: login page URL

    def parse(self, response):
        # from_response() copies the login page's hidden inputs (e.g.
        # the CSRF token) into the POST, so only credentials are given.
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'username': 'zuolibing',
                'password': 'zuolibing',
            },
            callback=self.parse_link,
        )

    def parse_link(self, response):
        # extract_first() returns None when the xpath matches nothing
        # (e.g. login failed); extract()[0] raised IndexError instead.
        item = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()').extract_first()
        print(item)
scrapy使用cookie登录:
# coding=utf-8
import scrapy
class LoginSpider(scrapy.Spider):
    """Reuse an already-authenticated session by sending its cookies."""
    name = 'login'
    start_urls = ['url']  # TODO: page that requires login
    # Session cookies copied from a logged-in browser session.
    # NOTE: the original csrftoken value ended with a stray ';' which
    # would have been sent as part of the cookie value and broken auth.
    cookies = {
        'csrftoken': 'V6uSztzBUbnGScC3ds8pbLqhdnsoc4Wj',
        'sessionid': '74zm6gfnevp24nf15174ei9uqa6d01jh',
    }

    def start_requests(self):
        # Plain GET with the cookies attached; no form data is submitted,
        # so Request (not FormRequest) is the appropriate class.
        yield scrapy.Request(
            url=self.start_urls[0],
            cookies=self.cookies,
            callback=self.parse_url,
        )

    def parse_url(self, response):
        # extract_first() returns None instead of raising IndexError
        # when the element is missing (e.g. the cookies have expired).
        element = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()').extract_first()
        print(element)