This error occurs because the proxy address contains non-ASCII characters, which the ASCII encoding Scrapy uses by default cannot convert to bytes. The problem can be solved by specifying the encoding in the request headers in the middleware, as follows:
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import random
import requests

class RandomProxyMiddleware(HttpProxyMiddleware):
    def __init__(self, auth_encoding='utf-8', proxy_list=None):
        # Encode proxy credentials as UTF-8 instead of the default codec
        super().__init__(auth_encoding)
        self.proxy = proxy_list or []

    @classmethod
    def from_crawler(cls, crawler):
        # Read the PROXY list defined in the project's settings.py
        return cls(auth_encoding='utf-8',
                   proxy_list=crawler.settings.getlist('PROXY'))

    def check_proxy(self, proxy):
        # Verify that the proxy can reach the target site within 3 seconds
        try:
            requests.get('https://www.eastmoney.com/',
                         proxies={'http': proxy, 'https': proxy}, timeout=3)
            return True
        except requests.RequestException:
            return False

    def process_request(self, request, spider):
        proxy = random.choice(self.proxy)
        if self.check_proxy(proxy):
            print('The proxy IP currently in use is:', proxy)
            request.meta['proxy'] = proxy
            # Specify UTF-8 as the encoding
            request.headers.setdefault('Accept-Encoding', 'gzip, deflate')
            request.headers.setdefault('Content-Type', 'text/html; charset=utf-8')
        else:
            # Try another proxy if this one fails the check
            self.process_request(request, spider)
In the process_request method, the added header lines specify the encoding, so the UnicodeEncodeError is avoided.
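For the middleware to take effect, it has to be registered in the project's settings.py, and the PROXY list it reads must be defined there as well. The sketch below is a minimal example under assumptions not stated above: the project package is called myproject, the middleware class lives in myproject/middlewares.py, and the two proxy addresses are placeholders.

# settings.py -- minimal sketch; 'myproject.middlewares' and the proxy
# addresses are assumed placeholders, not values taken from the text
PROXY = [
    'http://127.0.0.1:8888',
    'http://127.0.0.1:9999',
]

DOWNLOADER_MIDDLEWARES = {
    # Replace the built-in proxy middleware with the custom subclass
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'myproject.middlewares.RandomProxyMiddleware': 750,
}

Disabling the built-in HttpProxyMiddleware keeps it from running alongside the subclass, and priority 750 places the custom middleware at the position the built-in one normally occupies.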