37. 5月29日 综合项目:爬取并分析知名B站UP主(2)
火烧云数据-B站数据分析平台-B站营销增长 (hsydata.com)
说明:因时间有限,以下代码尚未完成,目前无法直接运行,仅作为关键点参考,后续需要修改。
import pandas as pd
from collections import Counter
import ast
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Load the CSV and drop every row that contains a missing value, so the
# analysis functions in this file only ever see complete records.
df = pd.read_csv('小约翰可汗.csv').dropna()
def word_cloud(data=None, *, font_path='C:\\Windows\\Fonts\\HGWT_CNKI.TTF'):
    """Draw a word cloud sized by how often each tag occurs.

    Each non-null cell of the ``标签`` column is parsed with
    ``ast.literal_eval`` and iterated as a collection of tags; every tag
    occurrence counts once. The resulting counter is printed and then
    rendered as a 1200x800 word cloud on a white background.

    Args:
        data: DataFrame holding a ``标签`` column. Defaults to the
            module-level ``df``.
        font_path: Font file handed to ``WordCloud``. The default is the
            Windows path the script originally hard-coded; pass another
            font file when running elsewhere.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    frame = df if data is None else data

    tag_counts = Counter(
        tag
        for raw_tags in frame['标签'].dropna()  # ignore rows without tags
        for tag in ast.literal_eval(raw_tags)
    )
    print(tag_counts)

    if not tag_counts:
        # generate_from_frequencies raises ValueError when given no words,
        # so stop here with a readable message instead of a traceback.
        print('没有可用的标签,无法生成词云')
        return

    cloud = WordCloud(
        font_path=font_path,
        width=1200,
        height=800,
        background_color='white',
    )
    cloud.generate_from_frequencies(tag_counts)

    plt.figure(figsize=(12, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # a word cloud has no meaningful axes
    plt.show()
def dd(data=None):
    """Plot like-to-play and coin-to-play ratios for the ten best-liked rows.

    Two ratio columns are derived (``点赞数 / 播放数`` and
    ``投币数 / 播放数``), the rows are sorted by the like ratio in
    descending order, and the first ten are drawn as two labelled lines
    against the ``名称`` column.

    Args:
        data: DataFrame with ``名称``, ``播放数``, ``点赞数`` and ``投币数``
            columns. Defaults to the module-level ``df``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    frame = (df if data is None else data).dropna()

    # A zero play count would turn the ratios into inf/NaN, and inf sorts to
    # the top of a descending sort, so such rows are left out.
    frame = frame[frame['播放数'] > 0]

    # assign() builds a new frame instead of writing columns onto the result
    # of dropna(), which avoids chained-assignment warnings.
    frame = frame.assign(**{
        '播放点赞比': frame['点赞数'] / frame['播放数'],
        '播放投币比': frame['投币数'] / frame['播放数'],
    })

    top_rows = frame.sort_values(by='播放点赞比', ascending=False).head(10)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # default font for the labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable
    plt.figure(figsize=(10, 5))

    # Label both series so the legend tells the two lines apart.
    plt.plot(top_rows['名称'], top_rows['播放点赞比'], marker='o',
             label='Like-to-Play Ratio')
    plt.plot(top_rows['名称'], top_rows['播放投币比'], marker='o',
             label='Coin-to-Play Ratio')
    plt.legend()

    plt.title('Sorted Like-to-Play Ratio')
    plt.xlabel('Video Titles')
    plt.ylabel('Like-to-Play Ratio')
    # Rotate the x tick labels so long names do not overlap.
    plt.xticks(rotation=45, ha='right', size=8)
    plt.show()
def sjbx(data=None):
    """Print total and per-row average plays, likes and coins, plus ratios.

    Totals are the column sums of ``播放数``, ``点赞数`` and ``投币数``;
    averages divide those sums by the number of rows. Counts are printed in
    units of 万 (10,000) and the like/coin-to-play ratios as percentages.

    Args:
        data: DataFrame with ``播放数``, ``点赞数`` and ``投币数`` columns.
            Defaults to the module-level ``df``.

    Returns:
        None. All results are written to standard output.
    """
    frame = df if data is None else data

    row_count = len(frame)
    if row_count == 0:
        # Without rows every average would be a division by zero.
        print("没有可统计的数据")
        return

    total_plays = frame['播放数'].sum()
    total_likes = frame['点赞数'].sum()
    total_coins = frame['投币数'].sum()

    average_plays = total_plays / row_count
    average_likes = total_likes / row_count
    average_coins = total_coins / row_count

    # With zero plays the ratios are undefined; report NaN rather than divide.
    if average_plays:
        like_to_play_ratio = average_likes / average_plays
        coin_to_play_ratio = average_coins / average_plays
    else:
        like_to_play_ratio = coin_to_play_ratio = float('nan')

    print(f"总播放数: {total_plays / 10000:.2f}万")
    print(f"总点赞数: {total_likes / 10000:.2f}万")
    print(f"总投币数: {total_coins / 10000:.2f}万")
    print(f"平均播放数: {average_plays / 10000:.2f}万")
    print(f"平均点赞数: {average_likes / 10000:.2f}万")
    print(f"平均投币数: {average_coins / 10000:.2f}万")
    print(f"平均点赞与播放的比例: {like_to_play_ratio * 100:.2f}%")
    print(f"平均投币与播放的比例: {coin_to_play_ratio * 100:.2f}%")
def tag_statistics(data=None):
    """Draw a bar chart of total plays per tag.

    Plays, likes and coins are summed per tag, the 20 tags with the highest
    play total are selected, and the tags listed in ``excluded_tags`` are
    then removed from that selection, so fewer than 20 bars may be drawn.
    Only the play totals are plotted.

    Args:
        data: DataFrame with ``播放数``, ``点赞数``, ``投币数`` and ``标签``
            columns. Defaults to the module-level ``df``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    frame = df if data is None else data
    tag_totals = _aggregate_tag_totals(frame)

    top_tags = tag_totals.sort_values(by='播放数', ascending=False).head(20)
    excluded_tags = ['历史', '人文', '人物']
    top_tags = top_tags[~top_tags['标签'].isin(excluded_tags)]

    plt.rcParams['font.family'] = 'SimHei'
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(figsize=(10, 8))
    plt.bar(top_tags['标签'], top_tags['播放数'], color='skyblue')
    plt.title('标签播放数统计')
    plt.xlabel('Tag')
    plt.ylabel('播放数')
    plt.show()


def _aggregate_tag_totals(frame):
    """Return one row per tag with summed plays, likes and coins.

    Each ``标签`` cell is parsed with ``ast.literal_eval`` and iterated as a
    collection of tags; the row's play, like and coin counts are added to
    every tag it carries. Tags keep the order in which they first appear.
    """
    totals = {}
    rows = zip(frame['播放数'], frame['点赞数'], frame['投币数'], frame['标签'])
    for play, like, coin, raw_tags in rows:
        for tag in ast.literal_eval(raw_tags):
            sums = totals.setdefault(tag, [0, 0, 0])
            # Each metric goes into its own slot; the original added the play
            # count to all three.
            sums[0] += play
            sums[1] += like
            sums[2] += coin

    return pd.DataFrame(
        [(tag, *sums) for tag, sums in totals.items()],
        columns=['标签', '播放数', '点赞', '投币'],
    )
本文是原创文章,采用 CC BY-NC-ND 4.0 协议,完整转载请注明来自 现代职校董良
评论
匿名评论
隐私政策
你无需删除空行,直接评论以获取最佳展示效果