Crawling Weibo Posts and Comments for the Film Changjin Lake (长津湖)


# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang

import requests
import json
import re
import pprint
import pandas as pd

def comments_singlePage_crawl(url, headers, comments_info, id):
    """
    Crawl a single page of comments
    :param url: comments API url to crawl
    :param headers: request headers
    :param comments_info: dict that accumulates the crawled comments
    :param id: id of the post the comments belong to
    """
    # fetch the page
    html = requests.get(url, headers=headers).text
    # parse the JSON response
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(id)
        comments_info["date"].append(comment["created_at"])
        # keep only the plain text; the original tag patterns were stripped when the
        # post was published, so a generic HTML-tag pattern is assumed here
        text = re.sub(r"<.*?>", "", comment["text"])
        comments_info["text"].append(text)

def weibo_bowen_singlePage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl a single page of posts
    :param url: page url to crawl
    :param headers: request headers
    :param mblog_info: dict that accumulates the crawled posts (mblog entries)
    :param comments_info: dict that accumulates the crawled comments
    """
    # fetch the page
    html = requests.get(url, headers=headers).text
    # parse the JSON response
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # store the posts
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # keep only the plain text of the post (same assumed tag pattern as above)
        text = re.sub(r"<.*?>", "", mblog["text"])
        mblog_info["text"].append(text)
        # build the comments url; the endpoint prefix was stripped when the post was
        # published, so the standard m.weibo.cn comments endpoint is assumed here
        comments_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # save the comments, page by page, until the API stops returning data
        i = 0
        while True:
            try:
                comments_url_ = comments_url + str(i)
                comments_singlePage_crawl(comments_url_, headers, comments_info, mblog["id"])
                i += 1
            except:
                break
        pprint.pprint(comments_info)

def weibo_bowen_data_crawl(url, headers):
    """
    Crawl the post data
    :param url: base url of the target pages
    :param headers: request headers
    :return: post data dict mblog_info and comment data dict comments_info
    """
    # dict for the post data
    mblog_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # dict for the comment data
    comments_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # crawl pages 1 to 9 of the posts
    for i in range(1, 10):
        url_ = url + str(i)
        # add the posts and comments of this page
        weibo_bowen_singlePage_crawl(url_, headers, mblog_info, comments_info)
    return mblog_info, comments_info

def bowen_data_store(mblog_info, comments_info):
    """
    Organize the data and save it to Excel
    :param mblog_info: post data
    :param comments_info: comment data
    :return: data saved to Excel files
    """
    # save table 1: the posts
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    data["keyword"] = ["Film Changjin Lake"] * len(data["num"])
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # save table 2: the comments
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")

if __name__ == '__main__':
    # Weibo url for the Changjin Lake film account; the leading part of this url was
    # stripped when the post was published, so the m.weibo.cn container API prefix is
    # assumed here and the remaining query string is kept as published
    url = "https://m.weibo.cn/api/container/getIndex?luicode=10000011&lfid=100103type%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # request headers
    headers = {
        "cookie": "add your own cookie",
        "user-agent": "add your own user-agent"
    }
    mblog_info, comments_info = weibo_bowen_data_crawl(url, headers)
    bowen_data_store(mblog_info, comments_info)
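
The parsing in weibo_bowen_singlePage_crawl and comments_singlePage_crawl only works if the two endpoints return JSON containing the keys accessed above. The sketch below is not captured from the live API; it is a minimal, hypothetical illustration of the response shape those functions assume, with placeholder values, which is useful to keep in mind when the requests start failing (for example, when the cookie has expired and Weibo returns a login page instead of JSON).

# Hypothetical sketch of the response shapes assumed by the two crawl functions.
# Field values are placeholders; only the keys accessed above are shown.
post_page_response = {
    "data": {
        "cards": [
            {"mblog": {"id": "EXAMPLE_POST_ID", "created_at": "10-06", "text": "post html text"}}
        ]
    }
}
comment_page_response = {
    "data": {
        "data": [
            {"created_at": "Wed Oct 06 12:00:00 +0800 2021", "text": "comment html text"}
        ]
    }
}

If a response does not match this shape, json.loads or the dictionary lookups raise an exception, which is also what terminates the while True comment loop above.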
