
Member Purchase Project Interview Questions: Efficient Data Scraping and Exception Handling


Member Purchase Program

Highlights

  • Logs progress and errors with the logging module
  • Scrapes pages concurrently and asynchronously, greatly improving crawl speed
  • Catches exceptions and retries failed requests (a minimal sketch of both ideas follows this list)
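
Before reading the full listing, the two headline techniques can be seen in isolation. The sketch below is a minimal, self-contained illustration rather than code from the project (the example.com URL and page count are placeholders): a semaphore caps how many requests are in flight at once, and each request backs off exponentially between retries before giving up.

import asyncio
import aiohttp

CONCURRENCY = 4   # at most 4 requests in flight at once
RETRY_LIMIT = 3   # give up on a URL after 3 failed attempts

async def fetch(session, semaphore, url):
    async with semaphore:                      # wait for a free slot
        for attempt in range(RETRY_LIMIT):
            try:
                async with session.get(url) as response:
                    return await response.json()
            except aiohttp.ClientError:
                await asyncio.sleep(2 ** attempt)   # back off 1s, 2s, 4s, ...
        return None                            # every attempt failed

async def main():
    semaphore = asyncio.Semaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        urls = [f"https://example.com/api?page={p}" for p in range(1, 9)]
        results = await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))
        print(results)

asyncio.run(main())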

Source code

import logging
import time

import requests
import asyncio
import aiohttp
from aiohttp import ContentTypeError
import csv

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')


# Parse one page of raw results, yielding a record per project
def parse_data(data):
    if data:
        for meeting in data:
            project_id = meeting['project_id']
            project_name = meeting['project_name']
            start_time = meeting['start_time']
            venue_name = meeting['venue_name']
            price_low = meeting['price_low'] / 100    # prices appear to be in hundredths
            price_high = meeting['price_high'] / 100  # of the currency unit, so convert
            yield {
                'project_id': project_id,
                'project_name': project_name,
                'start_time': start_time,
                'venue_name': venue_name,
                'price_low': price_low,
                'price_high': price_high
            }


# Append one record to a per-city CSV file (named <city_id>.csv)
def save_file(city_info, city_id):
    if city_info:
        with open(f'{city_id}.csv', 'a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            writer.writerow([f'{city_info["project_id"]}', f'{city_info["project_name"]}', f'{city_info["start_time"]}',
                             f'{city_info["venue_name"]}', f'{city_info["price_low"]}', f'{city_info["price_high"]}'])
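
# Note: the file is opened in append mode ('a+'), so re-running the script adds
# duplicate rows to an existing CSV, and no header row is ever written.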


class Myspider(object):
    # Category labels passed as the p_type query parameter; the API expects the
    # Chinese names (performances / exhibitions / local life)
    types_list = ['演出', '展览', '本地生活']
    cities_id_list = []
    failed_urls = []

    CONCURRENCY = 4   # maximum number of concurrent page requests
    RETRY_LIMIT = 3   # attempts per URL before recording it as failed

    def __init__(self):
        self.session = None   # aiohttp session, created in main()
        self.semaphore = asyncio.Semaphore(Myspider.CONCURRENCY)  # caps concurrent requests

    # Fetch the city ids once and store them in the class attribute cities_id_list
    @staticmethod
    def set_cities_id():
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        cities_data = requests.get("/api/ticket/city/list?channel=4", headers=headers).json()['data']
        # 'list' holds the featured cities; 'more' holds the rest, grouped in sections
        developed_cities_id = [city['id'] for city in cities_data['list']]
        developing_cities_id = [city['id'] for part in cities_data['more'] for city in part['list']]
        Myspider.cities_id_list = developed_cities_id + developing_cities_id
        return None

    # Scrape one listing page (a single task), with retries
    async def get_every_page_info(self, url):
        async with self.semaphore:
            logging.info(f"scraping {url}")
            for attempt in range(Myspider.RETRY_LIMIT):
                try:
                    async with self.session.get(url) as response:
                        data = await response.json()
                        return data["data"]["result"]
                except ContentTypeError:
                    # Non-JSON response (e.g. an HTML error page); fall through and retry
                    logging.error(f"error occurred when scraping {url}", exc_info=True)
                except aiohttp.ServerDisconnectedError:
                    # Caught before ClientError, which is its parent class
                    logging.error(f"Server disconnected: {url}", exc_info=True)
                    if attempt < Myspider.RETRY_LIMIT - 1:
                        await asyncio.sleep(2 ** attempt)
                        continue
                except aiohttp.ClientError as e:
                    logging.error(f"ClientError on {url}: {e}", exc_info=True)
                    if attempt < Myspider.RETRY_LIMIT - 1:
                        await asyncio.sleep(2 ** attempt)  # exponential backoff
                        continue
            Myspider.failed_urls.append(url)
            return None  # give up: all retry attempts failed

    # Get the maximum number of pages for this category in this city
    def get_max_page(self, url):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        response = requests.get(url, headers=headers)
        data = response.json()
        return data["data"]["numPages"]
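
    # Note: get_max_page is a blocking requests call made from inside the async
    # workflow, so it briefly stalls the event loop. It runs only once per
    # (city, category) pair, so the cost is small, but it could also be awaited
    # through the shared aiohttp session.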

    # Main coroutine: build the per-page task lists and scrape them concurrently
    async def main(self):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        # Initialize the session up front (headers attached here; proxies, cookies
        # and similar settings could be added the same way)
        async with aiohttp.ClientSession(headers=headers) as session:
            self.session = session
            for p_type in Myspider.types_list:   # 'type' would shadow a builtin
                for city_id in Myspider.cities_id_list:
                    begin_url = "/api/ticket/project/listV2?version=134&page=1&pagesize=16&area={}&filter=&platform=web&p_type={}".format(
                        city_id, p_type)
                    max_page = self.get_max_page(begin_url)
                    # Build one scraping task per page of this (city, category) listing
                    scrapy_tasks = [self.get_every_page_info(
                        "/api/ticket/project/listV2?version=134&page={}&pagesize=16&area={}&filter=&platform=web&p_type={}".format(
                            page, city_id, p_type)) for page in range(1, max_page + 1)]
                    # Run the page tasks concurrently and collect the results
                    scrapy_results = await asyncio.gather(*scrapy_tasks)
                    # Parsing Result Data
                    for result in scrapy_results:
                        data = parse_data(result)
                        for city_info in data:
                            print(city_info)
                            save_file(city_info, city_id)
            # Close the connection (the async with block would also close it on exit)
            await self.session.close()


if __name__ == '__main__':
    # Record the start time
    start_time = time.time()
    # Fetch the city ids and set the class attribute cities_id_list
    Myspider.set_cities_id()
    # Initialize the spider
    spider = Myspider()
    # Create an event loop
    loop = asyncio.get_event_loop()
    # Run the main coroutine to completion
    loop.run_until_complete(spider.main())
    # Record the end time
    end_time = time.time()
    print(f"total_time: {end_time - start_time}")

    # print(spider.get_max_page('/api/ticket/project/listV2?version=134&page=1&pagesize=16&area=110100&filter=&platform=web&p_type=%E5%85%A8%E9%83%A8%E7%B1%BB%E5%9E%8B'))
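
One loose end worth noting: URLs that exhaust their retries are collected in Myspider.failed_urls but never revisited. A follow-up pass along the lines below could run after the main crawl. retry_failed is a hypothetical helper, not part of the original program, and it only prints the recovered records, since save_file needs a city id that would have to be parsed back out of each URL's area parameter.

# Hypothetical follow-up pass: re-scrape the URLs that failed every retry.
async def retry_failed(spider):
    if not Myspider.failed_urls:
        return
    # Take ownership of the failed list so this pass can repopulate it
    urls, Myspider.failed_urls = Myspider.failed_urls, []
    async with aiohttp.ClientSession() as session:  # headers omitted for brevity
        spider.session = session
        results = await asyncio.gather(*(spider.get_every_page_info(u) for u in urls))
    for result in results:
        for city_info in parse_data(result):
            print(city_info)

Separately, newer Python versions deprecate calling asyncio.get_event_loop() when no loop is running, so the last few lines of the script are usually written today as a single asyncio.run(spider.main()) call.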
