#需求:抓取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的结果会以文件的形式保存下来
    import requests
    import time
    from lxml import etree
    import json
    import csv
    import codecs
    
    class MaoYanTop100Spider:
        """Scrape the Maoyan TOP100 board and export the films to ``maoyan.csv``.

        The crawl has three stages, driven by :meth:`run`:

        1. :meth:`Top100_list` walks the ten board pages and collects, for every
           film, its name, release time, actors, score, poster image and the
           relative URL of its detail page.
        2. :meth:`film_page` visits each detail page and adds the plot summary.
        3. :meth:`save_data` writes all collected records to a CSV file.

        Attributes:
            film_page_url_list: relative detail-page URLs, in board order.
            film_info_list: one dict per film, in the same order.
        """

        # Site root; hrefs collected from the board are appended to it.
        SITE_URL = "https://maoyan.com"
        # Board listing; pages are selected with ``?offset=0,10,...,90``.
        BOARD_URL = SITE_URL + "/board/4"
        # Pause (seconds) before each list-page / detail-page request.
        LIST_PAGE_DELAY = 5
        DETAIL_PAGE_DELAY = 3
        # Upper bound (seconds) for a single HTTP request so a stalled
        # connection cannot hang the crawl forever.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            # Per-instance state: as class attributes these lists were shared by
            # every spider instance and kept growing across runs.
            self.film_page_url_list = []
            self.film_info_list = []

        @staticmethod
        def _first(nodes, default=""):
            """Return the first XPath match, or ``default`` if nothing matched."""
            return nodes[0] if nodes else default

        def _fetch_html(self, session, url, headers, delay):
            """Sleep ``delay`` seconds, GET ``url`` and return the decoded body.

            Returns ``None`` (after printing the reason) when the request fails
            or the body is empty, so one bad page does not abort the crawl.
            """
            time.sleep(delay)
            try:
                response = session.get(
                    url=url, headers=headers, timeout=self.REQUEST_TIMEOUT
                )
            except requests.RequestException as exc:
                print("request failed: {} ({})".format(url, exc))
                return None
            html = response.content.decode("utf-8")
            if not html.strip():
                print("empty response: {}".format(url))
                return None
            return html

        def _parse_board_entry(self, entry):
            """Build the info dict for one ``<dd>`` element of the board list.

            Fields that are missing from the markup become empty strings
            instead of raising ``IndexError``.
            """
            first = self._first
            details = "./div/div/div[1]"
            score = "./div/div/div[2]/p"

            film_name = first(entry.xpath("./a/@title"))
            # The first 5 characters of the text are the label preceding the date.
            film_time = first(entry.xpath(details + "/p[3]/text()"))[5:].strip()
            # After stripping, the first 3 characters are the label preceding
            # the actor names.
            film_actors = first(entry.xpath(details + "/p[2]/text()")).strip()[3:]
            # The score is rendered as two <i> nodes: integer part and fraction.
            score_int = first(entry.xpath(score + "/i[1]/text()"))
            score_fraction = first(entry.xpath(score + "/i[2]/text()"))
            film_img = first(entry.xpath("./a/img[2]/@data-src"))
            film_url = first(entry.xpath(details + "/p[1]/a/@href"))

            return {
                "name": film_name,
                "time": film_time,
                "actors": film_actors,
                "score": str(score_int) + str(score_fraction),
                "img": film_img,
                "url": film_url,
            }

        # 1. Fetch and parse the board (list) pages.
        def Top100_list(self, session, headers):
            """Collect the basic info of every film on the ten board pages.

            Appends one dict per film to ``self.film_info_list`` and the film's
            relative detail URL to ``self.film_page_url_list``.

            Args:
                session: ``requests.Session`` used for all requests.
                headers: HTTP headers sent with every request.
            """
            for offset in range(0, 91, 10):
                final_url = self.BOARD_URL + "?offset=" + str(offset)
                html = self._fetch_html(
                    session, final_url, headers, self.LIST_PAGE_DELAY
                )
                if html is None:
                    continue

                tree = etree.HTML(html)
                # Iterate over the entries that are actually present instead of
                # assuming exactly ten per page.
                entries = []
                if tree is not None:
                    entries = tree.xpath('//dl[@class="board-wrapper"]/dd')
                if not entries:
                    print("no board entries found on " + final_url)
                    continue

                for entry in entries:
                    film_info = self._parse_board_entry(entry)
                    self.film_info_list.append(film_info)
                    self.film_page_url_list.append(film_info["url"])

        # 2. Fetch and parse one film detail page.
        def film_page(self, url, session, headers, num):
            """Fetch a film's detail page and store its plot summary.

            The summary is written to ``self.film_info_list[num]["summary"]``;
            it is an empty string when the page cannot be fetched or contains
            no summary element, so every record ends up with the same keys.

            Args:
                url: detail-page URL relative to the site root.
                session: ``requests.Session`` used for the request.
                headers: HTTP headers sent with the request.
                num: index of the film in ``self.film_info_list``.
            """
            film_summary = ""
            if url:
                final_url = self.SITE_URL + str(url)
                print(final_url)
                html = self._fetch_html(
                    session, final_url, headers, self.DETAIL_PAGE_DELAY
                )
                tree = etree.HTML(html) if html is not None else None
                if tree is not None:
                    film_summary = self._first(
                        tree.xpath('//span[@class="dra"]/text()')
                    ).strip()
            self.film_info_list[num]["summary"] = film_summary

        # 3. Save the collected data to a CSV file.
        def save_data(self):
            """Write ``self.film_info_list`` to ``maoyan.csv`` (UTF-8).

            The header is the union of the records' keys in first-seen order;
            a record lacking a column gets an empty cell, which keeps the
            columns aligned. Nothing is written when no film was collected.
            """
            records = self.film_info_list
            if not records:
                print("no film data collected; maoyan.csv was not written")
                return

            fieldnames = []
            for record in records:
                for key in record:
                    if key not in fieldnames:
                        fieldnames.append(key)

            # newline="" lets the csv module control line endings itself; the
            # with-block closes the file even if writing raises.
            with open("maoyan.csv", "w", encoding="utf-8", newline="") as csv_fp:
                csv_writer = csv.DictWriter(csv_fp, fieldnames=fieldnames, restval="")
                csv_writer.writeheader()
                csv_writer.writerows(records)

        # Entry point of the crawl.
        def run(self):
            """Run the full crawl: board pages, detail pages, then CSV export."""
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            }
            # One session for the whole crawl so cookies persist between
            # requests; the with-block releases its connections at the end.
            with requests.Session() as session:
                # 1. Board pages.
                self.Top100_list(session=session, headers=headers)
                # 2. Detail pages.
                for i, film_page_url in enumerate(self.film_page_url_list):
                    self.film_page(
                        url=film_page_url, session=session, headers=headers, num=i
                    )
                    print(self.film_info_list[i])

            # 3. Export.
            self.save_data()
    
    
    
    # Start the crawl only when the file is executed as a script, so importing
    # the module does not trigger network requests. The bare integers that
    # followed this call were a pasted line-number gutter (no-op statements)
    # and have been removed.
    if __name__ == "__main__":
        MaoYanTop100Spider().run()