def crawl(self, tasks: list, task_handler=None, callback=None, semaphore_count=100):
    """Run ``task_handler`` concurrently over ``tasks`` on a fresh event loop.

    Args:
        tasks: items to process; each is passed to ``task_handler``.
        task_handler: async callable ``(task, i=..., n=..., session=...)``;
            defaults to ``self.page_handler``. Each call is expected to
            return an iterable (results are flattened).
        callback: optional done-callback attached to every task future.
            NOTE: the callback receives the future; results are ALSO
            returned from this method — the two are complementary, not
            contradictory.
        semaphore_count: max number of handlers running at once.

    Returns:
        Flattened list of all handler results, or ``[]`` on failure.
    """
    if not task_handler:
        task_handler = self.page_handler
    # The uvloop policy must be installed BEFORE creating the loop,
    # otherwise new_event_loop() returns a stock asyncio loop and the
    # policy change is a no-op.
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    main_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(main_loop)
    n = len(tasks)

    async def _run():
        # Create the semaphore inside the running loop: on Python 3.10+
        # a Semaphore created outside a loop is bound to the wrong (or
        # no) loop and raises at acquire time.
        semaphore = asyncio.Semaphore(semaphore_count)

        async def _bounded(task, i):
            # Acquire per task so at most ``semaphore_count`` handlers
            # run concurrently. The original acquired the semaphore once
            # around the whole batch, which limited nothing.
            async with semaphore:
                return await task_handler(task, i=i, n=n, session=session)

        # One shared ClientSession for the whole batch; closed on exit.
        async with ClientSession() as session:
            futures = []
            for i, task in enumerate(tasks, start=1):
                future = asyncio.ensure_future(_bounded(task, i))
                if callback:
                    future.add_done_callback(callback)
                futures.append(future)
            # gather preserves input order, so results line up with tasks
            # even though completion order is arbitrary.
            return await asyncio.gather(*futures)

    try:
        result = main_loop.run_until_complete(_run())
        # Each handler returns an iterable of items; flatten one level.
        return [item for sub in result for item in sub]
    except Exception as e:
        logging.exception(e)
        # Error path returns an empty list instead of None; putting the
        # return here (not after finally) keeps the success return intact.
        return []
    finally:
        if not main_loop.is_closed():
            main_loop.close()
async def page_handler(self, task, session, **kwargs):
    """Handle one page and fan out to its item sub-tasks.

    ``part of the code omitted`` — the omitted portion is assumed to
    fetch the page via ``session`` and build ``book_item_url_list``.

    NOTE(review): do NOT call ``self.crawl(...)`` from inside a coroutine:
    ``crawl`` calls ``run_until_complete`` on a new loop while the outer
    loop is still running, which raises "Cannot run the event loop while
    another loop is running". Child tasks must be scheduled on the
    CURRENT loop instead, reusing the already-open ``session``.
    """
    n = len(book_item_url_list)
    result = await asyncio.gather(
        *(
            self.item_handler(url, i=i, n=n, session=session)
            for i, url in enumerate(book_item_url_list, start=1)
        )
    )
    return result
-
怎样创建一个唯一 ClientSession()
- session = ClientSession() 这样不好使
-
在 page_handler 再次调用 crawl 出现
- Cannot run the event loop while another loop is running
- crawl 里面的任务会在产生子任务, 这个子任务如何用 loop 加入任务列表
-
我传了 callback 又用了 return 这是不是有点矛盾
-
我打印 i/n 这个 i 是乱序 怎样任务列表顺序执行