pycurl 单请求和并发请求处理
2017-06-25
pycurl 是什么
pycurl 是 Python 中针对 libcurl (curl 的底层库) 的封装, 功能强大, 性能优异。
应用
当我们需要在应用中, 向其他服务发起http
请求的时候, pycurl
就派上用场了
import pycurl
import urllib.parse
from io import BytesIO
# Single blocking GET request: the response body is collected in an
# in-memory buffer and printed as raw bytes.
c = pycurl.Curl()
buffer = BytesIO()  # receives the response body
url = 'https://www.baidu.com'
c.setopt(c.WRITEFUNCTION, buffer.write)  # body data is handed to buffer.write
c.setopt(c.CONNECTTIMEOUT, 1) # connect timeout: 1s
c.setopt(c.TIMEOUT, 3) # timeout for the whole request: 3s
c.setopt(c.CUSTOMREQUEST, "GET")
c.setopt(c.DNS_CACHE_TIMEOUT, 3600)  # presumably seconds (libcurl's unit) — confirm
c.setopt(c.URL, url)
c.perform()  # runs the transfer; nothing is read back until this returns
resp = buffer.getvalue()  # bytes, not str
print(resp)
可以得到http 响应
b'<html>\r\n<head>\r\n\t<script>\r\n\t\tlocation.replace(location.href.replace("https://","http://"));\r\n\t</script>\r\n</head>\r\n<body>\r\n\t<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>\r\n</body>\r\n</html>'
得到的 resp 是 bytes 类型 (字节串), 所以通常需要先解码成字符串
# Turn the raw body bytes into str; assumes the payload is UTF-8 encoded —
# TODO confirm against the charset the server actually sends.
resp = resp.decode('utf-8')
如果是post
请求, 结合我之前写的python构造类http_build_query函数简版, 构造post body, 如下
import pycurl
import urllib.parse
from io import BytesIO
# Single blocking POST request: same setup as the GET example, plus a
# form-encoded request body passed through POSTFIELDS.
c = pycurl.Curl()
buffer = BytesIO()  # receives the response body
url = 'https://www.baidu.com'
c.setopt(c.WRITEFUNCTION, buffer.write)  # body data is handed to buffer.write
c.setopt(c.CONNECTTIMEOUT, 1) # connect timeout: 1s
c.setopt(c.TIMEOUT, 3) # timeout for the whole request: 3s
c.setopt(c.CUSTOMREQUEST, "POST")
c.setopt(c.DNS_CACHE_TIMEOUT, 3600)  # presumably seconds (libcurl's unit) — confirm
c.setopt(c.URL, url)
# NOTE(review): the trailing "..." below is the article's elision of further
# fields; the line is not valid Python as written and must be completed.
params = {'name': 'Yi_Zhi_Yu'...}
request_body = url_encoder(params) # post-body builder written earlier; defined outside this snippet
c.setopt(c.POSTFIELDS, request_body)
c.perform()  # runs the transfer; nothing is read back until this returns
resp = buffer.getvalue()  # bytes, not str
print(resp)
当然, 如果还需要得到其他响应信息, 比如http code, response header, 如下
# Read response metadata from the handle after perform(); relies on the
# `c` and `buffer` objects created in the request snippet.
http_code = c.getinfo(c.HTTP_CODE)
http_body = buffer.getvalue()
content_type = c.getinfo(c.CONTENT_TYPE)
# Only Content-Type is captured here; no other response header is collected.
http_headers = {'Content-Type': content_type}
并发请求
有的时候我们可能需要发起并发请求, 比如从多个 http 服务获取数据, 然后整合。
如果像上面那样逐个发起 http 请求, 每个请求都会阻塞, 总耗时就是各个请求耗时之和。
pycurl 给我们提供了一种方式, 允许我们并发请求, 这样总的请求耗时就低多了
def multi_curl_request(request_params):
    """Run several HTTP requests concurrently on one ``pycurl.CurlMulti``.

    Args:
        request_params: Iterable of dicts. Each dict must provide
            ``"method"`` (sent via ``CUSTOMREQUEST``) and ``"url"``.

    Returns:
        A list with one entry per request, in input order. Each entry is
        ``[http_code, http_body, http_headers, curl_info]``: ``http_body``
        is the bytes collected from the write callback, ``http_headers``
        carries only ``Content-Type``, and ``curl_info`` holds the effective
        URL plus the connect, name-lookup and total timings.

    Raises:
        KeyError: If a request dict lacks ``"method"`` or ``"url"``.
    """
    m = pycurl.CurlMulti()
    reqs = []
    try:
        for request_param in request_params:
            # Look the fields up first so a malformed entry fails before a
            # handle is allocated for it.
            method = request_param["method"]
            url = request_param['url']

            # Each request gets its own buffer; it has to exist before it is
            # registered as the write target.
            buffer = BytesIO()
            c = pycurl.Curl()
            c.setopt(c.WRITEFUNCTION, buffer.write)
            c.setopt(c.CONNECTTIMEOUT, 1)  # connect timeout: 1s
            c.setopt(c.TIMEOUT, 3)  # timeout for the whole request: 3s
            c.setopt(c.CUSTOMREQUEST, method)
            c.setopt(c.DNS_CACHE_TIMEOUT, 3600)
            c.setopt(c.URL, url)
            m.add_handle(c)
            reqs.append([buffer, c])

        # num_handlers tracks how many transfers are still running; it is
        # refreshed by every perform() call.
        num_handlers = len(reqs)
        while num_handlers:
            ret = m.select(2.0)
            if ret == -1:
                # The wait did not succeed (original note: timeout marker);
                # wait again instead of driving the transfers.
                continue
            while True:
                ret, num_handlers = m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

        resps = []
        for resp, c in reqs:
            http_code = c.getinfo(c.HTTP_CODE)
            http_body = resp.getvalue()
            content_type = c.getinfo(c.CONTENT_TYPE)
            http_headers = {'Content-Type': content_type}
            curl_info = {
                'url': c.getinfo(c.EFFECTIVE_URL),
                'connect_time': c.getinfo(c.CONNECT_TIME),
                'namelookup_time': c.getinfo(c.NAMELOOKUP_TIME),
                'total_time': c.getinfo(c.TOTAL_TIME),
            }
            resps.append([http_code, http_body, http_headers, curl_info])
        return resps
    finally:
        # Release every easy handle and the multi handle, including when a
        # request failed part-way through.
        for _, c in reqs:
            m.remove_handle(c)
            c.close()
        m.close()
这部分比较难以理解, 尤其是使用 while 循环的部分
简单来说, num_handlers 表示仍在进行数据传输的请求数量, 外层 while 监控这个数量, 每当有一个请求结束 (无论成功还是失败), num_handlers 就会减少, 直到变为 0
# Keep calling perform() for as long as libcurl asks to be called again
# right away; num_handlers is refreshed on every call.
ret = pycurl.E_CALL_MULTI_PERFORM
while ret == pycurl.E_CALL_MULTI_PERFORM:
    ret, num_handlers = m.perform()
这一部分用于驱动数据传输: 只要 m.perform() 返回 E_CALL_MULTI_PERFORM, 就表示 libcurl 要求立即再次调用, 于是继续循环; 返回其他值时 break, 回到外层循环, 通过 select 等待下一次网络事件