• 爬虫爬取人民网


    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy_readbook.items import ScrapyReadbookItem


    class ReadSpider(CrawlSpider):
        """Crawl one finance.people.com.cn article and append its fields to a text file.

        For every response routed to ``parse_item`` the spider extracts the
        title, the text of every ``<p>`` element, the date line and the source
        link text, prints them, and appends them to ``output_file``.
        """

        name = "read"
        # Path of the UTF-8 text file that scraped fields are appended to.
        output_file = "output.txt"
        allowed_domains = ["finance.people.com.cn"]
        start_urls = ["http://finance.people.com.cn/n1/2023/1024/c1004-40101769.html"]

        # Only links matching this exact article URL are handed to parse_item;
        # links found on the resulting pages are not followed any further.
        # NOTE(review): the rule matches links *extracted from* the start page,
        # so parse_item presumably only runs if the page links to itself --
        # confirm, or override parse_start_url if the start page must be parsed.
        rules = (
            Rule(
                LinkExtractor(allow=r"http://finance\.people\.com\.cn/n1/2023/1024/c1004-40101769\.html"),
                callback="parse_item",
                follow=False,
            ),
        )

        def parse_item(self, response):
            """Extract the article fields from *response*, print and save them.

            Args:
                response: the Scrapy response for an article page.

            Side effects:
                Prints the extracted fields and appends four newline-terminated
                entries (title, content, date, source) to ``self.output_file``.
            """
            print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

            # The absolute XPaths are tied to the page layout; default to an
            # empty string so a layout change yields an empty field instead of
            # a TypeError when the value is concatenated below.
            title = response.xpath(
                '/html/body/div[1]/div[7]/div[1]/h1/text()'
            ).extract_first(default="")
            content = '\n'.join(response.xpath('//p/text()').extract())
            date = response.xpath(
                '/html/body/div[1]/div[7]/div[1]/div[2]/div[1]/text()'
            ).extract_first(default="")
            # extract() returns a list of strings (possibly empty).
            source = response.xpath(
                '/html/body/div[1]/div[7]/div[1]/div[2]/div[1]/a/text()'
            ).extract()

            print(title)
            print(content)
            print(date)
            print(source)

            # Join the list before writing: concatenating a list with "\n"
            # raises TypeError.
            source_text = "".join(source)

            # To feed an item pipeline instead of writing the file here, yield
            # ScrapyReadbookItem(title=title, content=content) from this method.
            with open(self.output_file, "a", encoding="utf-8") as file:
                file.write(title + "\n")
                file.write(content + "\n")
                file.write(date + "\n")
                file.write(source_text + "\n")

  • 相关阅读:
    min_max_gray
    大数据flink篇之三-flink运行环境安装后续一yarn-session安装
    我眼中的大数据(三)——MapReduce
    【C语言基础】:操作符详解(一)
    深度学习在图像识别中的革命性应用
    2--Linux:基础命令
    Java跨模块无法扫描到controller问题解决
    GDP-L-岩藻糖二钠盐,GDP-fucose ,6-Deoxy-β-L-galactopyranosylguanosine 5′-diphosphate
    Github上都在疯找的京东内部“架构师进阶手册”终于来了
    2023年内网穿透常用的几个工具
  • 原文地址:https://blog.csdn.net/weixin_63403986/article/details/134020519