• Python爬取阿里巴巴商城数据


    目录

    1.前言

    2、解决方案

    3、现在开始上代码实现

    4、最后总结:

                                我是政胤 期待你的关注



    1.前言

    大家好 我是每天走在刑的第一线的政胤

    今天教大家获取阿里巴巴列表页的商品信息,包含商品title、商品主图片,并且存入xlsx文件保存。我是政胤,制作不易,点个免费的关注吧

    2、解决方案

         首先给出的方案是:

         2.1、通过wxPython框架写出一个可视化界面,

         2.2、因为阿里巴巴反爬比较严格,所以我直接通过selenium模拟用户操作来绕过反爬机制

         2.3、编写浏览器池方便实现多线程爬取数据

         2.4、编写爬数据业务逻辑

    3、现在开始上代码实现

         3.1 首先初始化一个浏览器池

    1. from multiprocessing import Manager
    2. from time import sleep
    3. from tool.open_browser import open_browser
    class DriverPool:
        """Fixed-size pool of browser drivers handed out by name through a queue.

        Every browser is opened eagerly in the constructor and stored in
        ``self.drivers`` under the key ``driver_<n>``.  The names of the
        currently idle browsers live in a ``multiprocessing.Manager`` queue;
        ``getDriver`` takes a name out of it and ``putDriver`` puts it back.
        """

        def __init__(self, max_nums, driver_path, ui, open_headless=0):
            """Create the pool and open ``max_nums`` browsers.

            :param max_nums: number of browsers to open.
            :param driver_path: forwarded to ``open_browser`` as ``excute_path``.
            :param ui: object attached to every driver as ``driver.ui``.
            :param open_headless: forwarded unchanged to ``open_browser``.
            """
            self.ui = ui
            self.drivers = {}
            self.manager = Manager()
            self.queue = self.manager.Queue()
            self.max_nums = max_nums
            self.open_headless = open_headless
            self.CreateDriver(driver_path)

        def CreateDriver(self, driver_path):
            """Open the browsers and register each one as idle.

            :param driver_path: forwarded to ``open_browser`` as ``excute_path``.
            :return: None
            """
            for index in range(1, self.max_nums + 1):
                name = f'driver_{index}'
                driver = open_browser(excute_path=driver_path, open_headless=self.open_headless)
                driver.ui = self.ui
                self.drivers[name] = driver
                self.queue.put(name)

        def getDriver(self):
            """Take an idle browser out of the pool, waiting until one is free.

            The pool name is stored on the returned driver as
            ``pool_name_driver`` so the caller can hand it back to ``putDriver``.

            :return: the driver object registered under the dequeued name.
            """
            # A blocking get() waits for a free browser without polling and
            # without growing the call stack while the pool is exhausted.
            name = self.queue.get()
            driver = self.drivers[name]
            driver.pool_name_driver = name
            return driver

        def putDriver(self, name):
            """Mark a browser as idle again.

            :param name: pool name of the browser (``driver.pool_name_driver``).
            :return: None
            """
            self.queue.put(name)

        def quit(self):
            """Close every browser in the pool.

            Shutdown is best-effort: a browser that fails to quit does not
            prevent the remaining ones from being closed.

            :return: None
            """
            for driver in self.drivers.values():
                try:
                    driver.quit()
                except Exception:
                    # Keep going so the other browsers still get closed.
                    pass

         3.2 编写UI操作界面

    def intUIRun(self):
        """Build the widgets of the main run panel and bind their events.

        Creates a child panel of ``self.panel_run`` that holds a status line,
        the execution-ID text field with its refresh button, three action
        buttons and a read-only multi-line log box, then adds that child panel
        to ``self.panel_run.Sizer``.  All widgets are placed with explicit
        ``size`` / ``pos`` values.

        :return: None
        """
        panel = wx.Panel(self.panel_run)
        panel.Sizer = wx.BoxSizer(wx.VERTICAL)

        # Status line: a fixed caption followed by the current status text.
        self.text = wx.StaticText(panel, -1, '状态栏目:', size=(100, 40), pos=(0, 10))
        self.text_input = wx.StaticText(panel, -1, '', size=(900, 40), pos=(100, 0))

        # Execution-ID row: caption, editable field and a refresh button.
        wx.StaticText(panel, -1, '当前执行ID:', size=(100, 30), pos=(0, 65)).SetFont(self.font)
        self.text_time = wx.TextCtrl(panel, id=self.choices_id_ref, value=self.time_str,
                                     size=(300, 30), pos=(150, 60),
                                     style=wx.TE_AUTO_URL | wx.TE_MULTILINE)
        self.reflush_text_time = wx.Button(panel, -1, '刷新ID', size=(100, 50), pos=(480, 50))
        self.text_time.SetFont(self.font)
        self.reflush_text_time.SetForegroundColour(wx.RED)
        self.reflush_text_time.SetFont(self.font)

        self.text_input.SetBackgroundColour(wx.WHITE)
        self.text_input.SetLabel(self.in_text)
        self.text_input.SetFont(self.font)
        self.text.SetFont(self.font)

        # Action buttons; the second positional argument is the widget id that
        # the Bind() calls below refer to.
        wx.Button(panel, self.get_product, '获取商品保存本地', size=(200, 100), pos=(0, 100)).SetFont(self.font)
        wx.Button(panel, self.save_mysql, '保存数据库和OSS', size=(200, 100), pos=(200, 100)).SetFont(self.font)
        wx.Button(panel, self.end_process, '结束执行', size=(200, 100), pos=(400, 100)).SetFont(self.font)

        # Read-only log output area.
        self.log_text = wx.TextCtrl(panel, size=(1000, 500), pos=(0, 210),
                                    style=wx.TE_MULTILINE | wx.TE_READONLY)
        # NOTE(review): the log target is created but not installed with
        # wx.Log.SetActiveTarget() here - confirm that happens elsewhere.
        wx.LogTextCtrl(self.log_text)

        self.Bind(wx.EVT_BUTTON, self.get_product_p, id=self.get_product)
        self.Bind(wx.EVT_BUTTON, self.save_mysql_p, id=self.save_mysql)
        self.Bind(wx.EVT_BUTTON, self.end_process_p, id=self.end_process)
        self.text_time.Bind(wx.EVT_COMMAND_LEFT_CLICK, self.choices_id, id=self.choices_id_ref)
        self.reflush_text_time.Bind(wx.EVT_BUTTON, self.reflush_time_evt)

        self.panel_run.Sizer.Add(panel, flag=wx.ALL | wx.EXPAND, proportion=1)

         效果图

         3.3编写业务逻辑

         获取商品列表页数据

    global _getMainProduct, goods_info


    def _getMainProduct(data_url):
        """Collect the product-detail links shown on one listing page.

        Used as the worker function for ``pool.map_async``.  A browser is
        borrowed from the driver pool for the duration of the call and is
        always handed back, and a completion message is always pushed to
        ``self.queue_print``, whether the page could be read or not.

        :param data_url: sequence ``(self, url, driver_pool)`` - the owning
            object, the listing-page URL and the driver pool to borrow from.
        :return: list of ``href`` values of the product links; an empty list
            when the page could not be processed.
        """
        self, url, driver_pool = data_url
        driver = driver_pool.getDriver()
        goods_urls = []
        try:
            # Built inside the try so the driver is returned even if the
            # wrapper itself fails to construct.
            c = Common(driver)
            self.ui.print(f'当前获取第{url}页数据')
            c.d.get(url)
            c.wait_page_loaded(url)
            if self.is_load_cache_cookies:
                # Cookies can only be applied after a first visit; reload the
                # page afterwards so they take effect.
                self.load_cookies(c.d)
                c.d.get(url)
            c.wait_page_loaded(url)
            ele = c.find_element(By.CSS_SELECTOR, '[class="component-product-list"]')
            anchors = ele.find_elements(By.CSS_SELECTOR, 'a[class="product-image"]')
            # Assigned only once every href was read, so a failure part-way
            # never leaks raw element objects to the caller.
            goods_urls = [anchor.get_attribute('href') for anchor in anchors]
        except SystemExit:
            sys.exit(1)
        except Exception:
            self.print(f'请求页面超出范围: {url} ERROR: {traceback.format_exc()}')
        finally:
            driver_pool.putDriver(driver.pool_name_driver)
            self.queue_print.put(f'请求完成:{url}')
        return goods_urls
    def getMainProduct_(self):
        """Fetch every listing page in ``self.pageNums`` and gather product links.

        Builds one URL per page number from ``self.pageNums[0]`` to
        ``self.pageNums[1]`` (inclusive), dispatches them to
        ``_getMainProduct`` through ``self.pool.map_async`` and reports
        progress from ``self.queue_print`` while the workers run.

        :return: set of the distinct non-empty product links; it is also
            stored on ``self.goods_info``.
        :raises SystemExit: when ``self.ui.is_exit_process`` is set while the
            URL list is being built.
        """
        urls = []
        for page in range(self.pageNums[0], self.pageNums[1] + 1):
            if self.ui.is_exit_process:
                # Same effect as exit(), without relying on the site module.
                raise SystemExit
            urls.append([self, self.url.format(page), self.drive_pool])

        products = []
        if urls:
            total = len(urls)
            done = 0
            p = self.pool.map_async(_getMainProduct, urls)
            while not p.ready():
                if self.queue_print.empty():
                    # Wait briefly instead of spinning a CPU core.
                    p.wait(0.1)
                    continue
                done += 1
                self.print(self.queue_print.get(), f'完成:{done}/{total}')
            # Report messages that arrived between the last poll and
            # completion so none are left behind in the shared queue.
            while not self.queue_print.empty():
                done += 1
                self.print(self.queue_print.get(), f'完成:{done}/{total}')
            products = p.get()

        goods_info = {link for page_links in products for link in page_links if link}
        self.goods_info = goods_info
        return goods_info


    goods_info = getMainProduct_(self)

         获取详情页数据

    global goods, Common, driver_pool, goods_url, sleep, re, By


    def get_info_(self, data_info):
        """Scrape one product-detail page into a dictionary.

        A browser is borrowed from the driver pool, the page is loaded and
        scrolled step by step, and the title, meta description/keywords,
        video link, overview HTML, description data and main images are
        collected.  The browser is always returned to the pool and a
        completion message is always pushed to ``self.queue_print``.

        :param self: owning object providing settings, counters and logging.
        :param data_info: sequence ``(goods_url, driver_pool)``.
        :return: dict describing the product, or ``None`` when scraping
            failed (the failure is logged and appended to ``self.error_page``).
        :raises SystemExit: when ``self.ui.is_exit_process`` is set.
        """
        if self.ui.is_exit_process:
            # Same effect as exit(), without relying on the site module.
            raise SystemExit
        goods_url, driver_pool = data_info
        driver = driver_pool.getDriver()
        try:
            # Built inside the try so the driver is returned even if the
            # wrapper itself fails to construct.
            c = Common(driver)
            c.d.get(goods_url)
            sleep(3)
            if self.is_load_cache_cookies:
                # Cookies can only be applied after a first visit; reload the
                # page afterwards so they take effect.
                self.load_cookies(c.d)
                c.d.get(goods_url)
            c.wait_page_loaded(goods_url)
            # Scroll down in small steps with a short pause between them.
            # NOTE(review): presumably this is what triggers lazily loaded
            # content - confirm.
            for offset in range(400, 18000, 200):
                sleep(0.1)
                c.d.execute_script(f'document.documentElement.scrollTop={offset};')

            is_all = c.find_element_true(By.CSS_SELECTOR, '[id="J-rich-text-description"]')
            if not is_all:
                self.print(f'没有发现: {is_all}')
            is_video = c.find_elements_true(By.CSS_SELECTOR, '[class="bc-video-player"]>video')
            is_title = c.find_element_true(By.CSS_SELECTOR, '[class="module-pdp-title"]')
            is_description = c.find_element_true(By.CSS_SELECTOR, '[name="description"]')
            is_keywords = c.find_element_true(By.CSS_SELECTOR, '[name="keywords"]')
            is_overview = c.find_element_true(By.CSS_SELECTOR, '[class="do-overview"]')

            # The category ID comes from configuration; it is not read from
            # the page.
            wz_goods_cat_id = self.wz_goods_cat_id

            # NOTE(review): the literal "ssssss" prefix means this pattern
            # does not match a plain numeric "<digits>.html" URL, so the
            # hash-based fallback ID below is used for those - confirm this
            # is intended.
            goods_id = re.search(r'(ssssss\d+)\.html$', goods_url)

            goods = {
                "商品分类ID": int(wz_goods_cat_id) if wz_goods_cat_id else 0,
                "商品ID": goods_id.group(1) if goods_id else self.getMd5(f'{time.time()}') + '其他',
                "商品链接": goods_url,
                "描述": c.find_element(By.CSS_SELECTOR, '[name="description"]').get_attribute(
                    'content') if is_description else '',
                "标题": is_title.get_attribute('title') if is_title else '',
                "关键字": c.find_element(By.CSS_SELECTOR, '[name="keywords"]').get_attribute(
                    'content') if is_keywords else is_keywords,
                "视频连接": c.find_element(By.CSS_SELECTOR, '[class="bc-video-player"]>video').get_attribute(
                    'src') if is_video else '',
                "主图片": [],
                "商品详情": c.d.execute_script(
                    '''return document.querySelectorAll('[class="do-overview"]')[0].outerHTML;''') if is_overview else is_overview,
                "商品描述": '',
                "商品描述图片": []
            }

            # Merge in the description text and description images.
            goods_desc = getDescriptionFactory1(self, c, goods_url)
            goods.update(goods_desc)

            # Main images: keep each thumbnail URL and, when the URL contains
            # a "<w>x<h>" size token, also the URL with that token removed.
            size_token = re.compile(r'(\d+x\d+)')
            m_imgs = c.find_elements(By.CSS_SELECTOR, '[class="main-image-thumb-ul"]>li')
            for m_img in m_imgs:
                try:
                    img = m_img.find_element(By.CSS_SELECTOR, '[class="J-slider-cover-item"]').get_attribute('src')
                    match = size_token.search(img)
                    img2 = str(img).replace(match.group(1), '') if match else None
                    goods['主图片'].append(img)
                    if img2:
                        goods['主图片'].append(img2)
                except Exception:
                    # Best-effort: skip thumbnails that cannot be read.
                    continue

            self.ui.status['请求成功商品数量'] += 1
            return goods
        except Exception:
            error_text = traceback.format_exc()
            traceback.print_exc()
            self.print(f'=========================\n链接请求错误: {goods_url} \n {error_text}\n=========================')
            self.error_page.append([goods_url, error_text])
            self.ui.status['请求失败商品数量'] += 1
            return None
        finally:
            driver_pool.putDriver(driver.pool_name_driver)
            self.queue_print.put(f'请求完成:{goods_url}')


    goods = get_info_(self, data_info)

         写入excel

    def export_excel(self, results):
        """Write the scraped records to a timestamped ``.xlsx`` file.

        The file is created as
        ``data/xls/<self.now>/<self.url_id>/<timestamp>.xlsx``; missing
        directories are created.  The header row is the ordered union of the
        keys of all records, and every value is written as text in the
        column of its key, so records with differing keys stay aligned.

        :param results: sequence of dicts, one per product.  Nothing is
            written when it is empty.
        :return: None
        """
        if not results:
            return

        out_dir = os.path.join('data', 'xls', self.now, self.url_id)
        os.makedirs(out_dir, exist_ok=True)
        file_stamp = time.strftime('%Y_%m_%d__%H_%M_%S', time.localtime())
        xlsx_path = os.path.join(out_dir, f"{file_stamp}.xlsx")

        # dict.fromkeys keeps first-seen order while removing duplicates.
        titles = list(dict.fromkeys(key for result in results for key in result))

        # The context manager closes (and thereby saves) the workbook even
        # if writing a cell raises.
        with xlsxwriter.Workbook(xlsx_path) as workbook:
            sheet = workbook.add_worksheet(name='阿里巴巴信息')
            for col, title in enumerate(titles):
                sheet.write_string(0, col, title)
            for row, result in enumerate(results, start=1):
                for col, title in enumerate(titles):
                    sheet.write_string(row, col, str(result.get(title, '')))

    4、最后总结:

        由于通过selenium执行浏览器操作没有直接请求接口效率高,所以最后使用了多线程,在执行效率上做了一些提升。

                                我是政胤 期待你的关注

  • 相关阅读:
    JavaScript学习笔记01
    Leetcode算法题练习(一)
    车载电子电器架构 —— 电气架构开发计划
    Git--分布式版本控制工具
    hudi系列-借助hudi优化架构
    BUGKU CTF WE篇(二)
    CMake官方教程3--对库加入使用限制
    实现在外网SSH远程访问内网树莓派的详细教程
    (2022版)一套教程搞定k8s安装到实战 | PV/PVC
    009 springboot整合mybatis-plus 增删改查 ajax 登录退出accessToken
  • 原文地址:https://blog.csdn.net/m0_69043821/article/details/125491644