forked from TurboWay/bigdata_analyse
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
way
committed
Dec 25, 2020
1 parent
57a196c
commit 61920cc
Showing
12 changed files
with
6,056 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/.idea/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
--1.新增用户分析 | ||
|
||
-- 新增用户总量 | ||
select count(1) from age_of_barbarians; --3116941 | ||
|
||
-- PU ( Paying Users):付费用户总量 | ||
select count(1) from age_of_barbarians where pay_price > 0; --60988 | ||
|
||
-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。 | ||
``` | ||
点击:点击广告页或者点击广告链接数 | ||
下载:点击后成功下载用户数 | ||
安装:下载程序并成功安装用户数 | ||
激活:成功安装并首次激活应用程序 | ||
注册:产生user_id | ||
DNU:产生user_id并且首次登陆 | ||
``` | ||
select cast(register_time as date) as day, | ||
count(1) as dnu, | ||
sum(case when pay_price > 0 then 1 else 0 end ) as dnpu | ||
from age_of_barbarians | ||
group by cast(register_time as date) | ||
order by day; | ||
|
||
-- 每小时的新登入用户数量 | ||
select hour(cast(register_time as datetime)) as hour, | ||
count(1) as dnu, | ||
sum(case when pay_price > 0 then 1 else 0 end ) as dnpu | ||
from age_of_barbarians | ||
group by hour(cast(register_time as datetime)) | ||
order by hour; | ||
|
||
|
||
--2.用户活跃度分析 | ||
|
||
-- DAU、WAU、MAU(Daily Active Users、Weekly Active Users、Monthly Active Users):每日、每周、每月登陆游戏的用户数,一般为自然周与自然月。 | ||
|
||
-- 平均在线时长 | ||
select avg(avg_online_minutes) from age_of_barbarians; --10.615346021266106 | ||
|
||
-- 付费玩家的平均在线时长 | ||
select avg(avg_online_minutes) from age_of_barbarians where pay_price > 0; --138.80478126869235 | ||
|
||
|
||
--3.玩家付费情况分析 | ||
|
||
-- APA(Active Payment Account):活跃付费用户数。 | ||
select count(1) as APA from age_of_barbarians where pay_price > 0 and avg_online_minutes > 0; --60987 | ||
|
||
-- ARPU(Average Revenue Per User) :平均每用户收入。 | ||
select sum(pay_price)/sum(case when avg_online_minutes > 0 then 1 else 0 end) from age_of_barbarians; --0.5824066558640159 | ||
|
||
-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。 | ||
select sum(pay_price)/sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) from age_of_barbarians; --29.190265138469332 | ||
|
||
-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。 | ||
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) / sum(case when avg_online_minutes > 0 then 1 else 0 end) | ||
from age_of_barbarians; --0.019952085159256484 | ||
|
||
-- 付费玩家人数,付费总额,付费总次数,平均每人付费,平均每人付费次数,平均每次付费 | ||
select count(1) as pu, --60988 | ||
sum(pay_price) as sum_pay_price, --1780226.6999998293 | ||
avg(pay_price) as avg_pay_price, --29.189786515377275 | ||
sum(pay_count) as sum_pay_count, --193030.0 | ||
avg(pay_count) as avg_pay_count, --3.165048862071227 | ||
sum(pay_price) / sum(pay_count) as each_pay_price --9.222538983576797 | ||
from age_of_barbarians | ||
where pay_price > 0; | ||
|
||
|
||
--4.玩家习惯分析 | ||
|
||
--胜率 | ||
select sum(pvp_win_count) / sum(pvp_battle_count) as pvp_win_rate, --玩家pvp胜率 | ||
sum(case when pay_price > 0 then pvp_win_count else 0 end) / sum(case when pay_price > 0 then pvp_battle_count else 0 end) as pve_win_rate_pay, --付费玩家pve胜率 | ||
sum(case when pay_price = 0 then pvp_win_count else 0 end) / sum(case when pay_price = 0 then pvp_battle_count else 0 end) as pve_win_rate_nor, --非付费玩家pve胜率 | ||
sum(pve_win_count) / sum(pve_battle_count) as pve_win_rate, --玩家pve胜率 | ||
sum(case when pay_price > 0 then pve_win_count else 0 end) / sum(case when pay_price > 0 then pve_battle_count else 0 end) as pve_win_rate_pay, --付费玩家pve胜率 | ||
sum(case when pay_price = 0 then pve_win_count else 0 end) / sum(case when pay_price = 0 then pve_battle_count else 0 end) as pve_win_rate_nor --非付费玩家pve胜率 | ||
from age_of_barbarians; | ||
|
||
--pvp场次 | ||
select sum(pvp_battle_count) as pvp_battle_count, --玩家pvp场次 | ||
avg(pvp_battle_count) as pvp_battle_count_avg, --玩家平均pvp场次 | ||
sum(case when pay_price > 0 then pvp_battle_count else 0 end) as pvp_battle_count_pay, --付费玩家pvp场次 | ||
sum(case when pay_price > 0 then pvp_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end), --付费玩家平均pvp场次 | ||
sum(case when pay_price = 0 then pvp_battle_count else 0 end) as pvp_battle_count_nor, --非付费玩家pvp场次 | ||
sum(case when pay_price = 0 then pvp_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) --非付费玩家平均pvp场次 | ||
from age_of_barbarians; | ||
|
||
--pve场次 | ||
select sum(pve_battle_count) as pve_battle_count, --玩家pve场次 | ||
avg(pve_battle_count) as pve_battle_count_avg, --玩家平均pve场次 | ||
sum(case when pay_price > 0 then pve_battle_count else 0 end) as pve_battle_count_pay, --付费玩家pve场次 | ||
sum(case when pay_price > 0 then pve_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end), --付费玩家平均pve场次 | ||
sum(case when pay_price = 0 then pve_battle_count else 0 end) as pve_battle_count_nor, --非付费玩家pvp场次 | ||
sum(case when pay_price = 0 then pve_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) --非付费玩家平均pve场次 | ||
from age_of_barbarians; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
/* | ||
说明:手游《野蛮时代》玩家数据 | ||
来源:https://js.dclab.run/v2/cmptDetail.html?id=226 | ||
大小:642 M (训练集)+ 219 M(测试集)= 861 M | ||
记录数:2,288,007 (训练集)+ 828,934(测试集)= 3,116,941 | ||
字段数:109 | ||
*/ | ||
|
||
-- 建表 | ||
drop table if exists age_of_barbarians; | ||
create table age_of_barbarians ( | ||
`user_id` string comment '玩家唯一ID', | ||
`register_time` string comment '玩家注册时间', | ||
`wood_add_value` string comment '木头获取数量', | ||
`wood_reduce_value` string comment '木头消耗数量', | ||
`stone_add_value` string comment '石头获取数量', | ||
`stone_reduce_value` string comment '石头消耗数量', | ||
`ivory_add_value` string comment '象牙获取数量', | ||
`ivory_reduce_value` string comment '象牙消耗数量', | ||
`meat_add_value` string comment '肉获取数量', | ||
`meat_reduce_value` string comment '肉消耗数量', | ||
`magic_add_value` string comment '魔法获取数量', | ||
`magic_reduce_value` string comment '魔法消耗数量', | ||
`infantry_add_value` string comment '勇士招募数量', | ||
`infantry_reduce_value` string comment '勇士损失数量', | ||
`cavalry_add_value` string comment '驯兽师招募数量', | ||
`cavalry_reduce_value` string comment '驯兽师损失数量', | ||
`shaman_add_value` string comment '萨满招募数量', | ||
`shaman_reduce_value` string comment '萨满损失数量', | ||
`wound_infantry_add_value` string comment '勇士伤兵产生数量', | ||
`wound_infantry_reduce_value` string comment '勇士伤兵恢复数量', | ||
`wound_cavalry_add_value` string comment '驯兽师伤兵产生数量', | ||
`wound_cavalry_reduce_value` string comment '驯兽师伤兵恢复数量', | ||
`wound_shaman_add_value` string comment '萨满伤兵产生数量', | ||
`wound_shaman_reduce_value` string comment '萨满伤兵恢复数量', | ||
`general_acceleration_add_value` string comment '通用加速获取数量', | ||
`general_acceleration_reduce_value` string comment '通用加速使用数量', | ||
`building_acceleration_add_value` string comment '建筑加速获取数量', | ||
`building_acceleration_reduce_value` string comment '建筑加速使用数量', | ||
`reaserch_acceleration_add_value` string comment '科研加速获取数量', | ||
`reaserch_acceleration_reduce_value` string comment '科研加速使用数量', | ||
`training_acceleration_add_value` string comment '训练加速获取数量', | ||
`training_acceleration_reduce_value` string comment '训练加速使用数量', | ||
`treatment_acceleraion_add_value` string comment '治疗加速获取数量', | ||
`treatment_acceleration_reduce_value` string comment '治疗加速使用数量', | ||
`bd_training_hut_level` string comment '建筑:士兵小屋等级', | ||
`bd_healing_lodge_level` string comment '建筑:治疗小井等级', | ||
`bd_stronghold_level` string comment '建筑:要塞等级', | ||
`bd_outpost_portal_level` string comment '建筑:据点传送门等级', | ||
`bd_barrack_level` string comment '建筑:兵营等级', | ||
`bd_healing_spring_level` string comment '建筑:治疗之泉等级', | ||
`bd_dolmen_level` string comment '建筑:智慧神庙等级', | ||
`bd_guest_cavern_level` string comment '建筑:联盟大厅等级', | ||
`bd_warehouse_level` string comment '建筑:仓库等级', | ||
`bd_watchtower_level` string comment '建筑:瞭望塔等级', | ||
`bd_magic_coin_tree_level` string comment '建筑:魔法幸运树等级', | ||
`bd_hall_of_war_level` string comment '建筑:战争大厅等级', | ||
`bd_market_level` string comment '建筑:联盟货车等级', | ||
`bd_hero_gacha_level` string comment '建筑:占卜台等级', | ||
`bd_hero_strengthen_level` string comment '建筑:祭坛等级', | ||
`bd_hero_pve_level` string comment '建筑:冒险传送门等级', | ||
`sr_scout_level` string comment '科研:侦查等级', | ||
`sr_training_speed_level` string comment '科研:训练速度等级', | ||
`sr_infantry_tier_2_level` string comment '科研:守护者', | ||
`sr_cavalry_tier_2_level` string comment '科研:巨兽驯兽师', | ||
`sr_shaman_tier_2_level` string comment '科研:吟唱者', | ||
`sr_infantry_atk_level` string comment '科研:勇士攻击', | ||
`sr_cavalry_atk_level` string comment '科研:驯兽师攻击', | ||
`sr_shaman_atk_level` string comment '科研:萨满攻击', | ||
`sr_infantry_tier_3_level` string comment '科研:战斗大师', | ||
`sr_cavalry_tier_3_level` string comment '科研:高阶巨兽骑兵', | ||
`sr_shaman_tier_3_level` string comment '科研:图腾大师', | ||
`sr_troop_defense_level` string comment '科研:部队防御', | ||
`sr_infantry_def_level` string comment '科研:勇士防御', | ||
`sr_cavalry_def_level` string comment '科研:驯兽师防御', | ||
`sr_shaman_def_level` string comment '科研:萨满防御', | ||
`sr_infantry_hp_level` string comment '科研:勇士生命', | ||
`sr_cavalry_hp_level` string comment '科研:驯兽师生命', | ||
`sr_shaman_hp_level` string comment '科研:萨满生命', | ||
`sr_infantry_tier_4_level` string comment '科研:狂战士', | ||
`sr_cavalry_tier_4_level` string comment '科研:龙骑兵', | ||
`sr_shaman_tier_4_level` string comment '科研:神谕者', | ||
`sr_troop_attack_level` string comment '科研:部队攻击', | ||
`sr_construction_speed_level` string comment '科研:建造速度', | ||
`sr_hide_storage_level` string comment '科研:资源保护', | ||
`sr_troop_consumption_level` string comment '科研:部队消耗', | ||
`sr_rss_a_prod_levell` string comment '科研:木材生产', | ||
`sr_rss_b_prod_level` string comment '科研:石头生产', | ||
`sr_rss_c_prod_level` string comment '科研:象牙生产', | ||
`sr_rss_d_prod_level` string comment '科研:肉类生产', | ||
`sr_rss_a_gather_level` string comment '科研:木材采集', | ||
`sr_rss_b_gather_level` string comment '科研:石头采集', | ||
`sr_rss_c_gather_level` string comment '科研:象牙采集', | ||
`sr_rss_d_gather_level` string comment '科研:肉类生产', | ||
`sr_troop_load_level` string comment '科研:部队负重', | ||
`sr_rss_e_gather_level` string comment '科研:魔法采集', | ||
`sr_rss_e_prod_level` string comment '科研:魔法生产', | ||
`sr_outpost_durability_level` string comment '科研:据点耐久', | ||
`sr_outpost_tier_2_level` string comment '科研:据点二', | ||
`sr_healing_space_level` string comment '科研:医院容量', | ||
`sr_gathering_hunter_buff_level` string comment '科研:领土采集奖励', | ||
`sr_healing_speed_level` string comment '科研:治疗速度', | ||
`sr_outpost_tier_3_level` string comment '科研:据点三', | ||
`sr_alliance_march_speed_level` string comment '科研:联盟行军速度', | ||
`sr_pvp_march_speed_level` string comment '科研:战斗行军速度', | ||
`sr_gathering_march_speed_level` string comment '科研:采集行军速度', | ||
`sr_outpost_tier_4_level` string comment '科研:据点四', | ||
`sr_guest_troop_capacity_level` string comment '科研:增援部队容量', | ||
`sr_march_size_level` string comment '科研:行军大小', | ||
`sr_rss_help_bonus_level` string comment '科研:资源帮助容量', | ||
`pvp_battle_count` string comment 'PVP次数', | ||
`pvp_lanch_count` string comment '主动发起PVP次数', | ||
`pvp_win_count` string comment 'PVP胜利次数', | ||
`pve_battle_count` string comment 'PVE次数', | ||
`pve_lanch_count` string comment '主动发起PVE次数', | ||
`pve_win_count` string comment 'PVE胜利次数', | ||
`avg_online_minutes` string comment '在线时长', | ||
`pay_price` string comment '付费金额', | ||
`pay_count` string comment '付费次数', | ||
`prediction_pay_price` string comment '45日付费金额' ) | ||
row format delimited | ||
fields terminated by ',' | ||
lines terminated by '\n'; | ||
|
||
-- 加载数据 | ||
LOAD DATA LOCAL INPATH '/home/getway/tap_fun_test.csv' | ||
INTO TABLE age_of_barbarians ; | ||
LOAD DATA LOCAL INPATH '/home/getway/tap_fun_train.csv' | ||
INTO TABLE age_of_barbarians ; | ||
|
||
-- 去掉 csv 的标题数据 | ||
insert overwrite table age_of_barbarians | ||
select * from age_of_barbarians | ||
where user_id <> 'user_id'; | ||
|
||
-- 查看数据重复性(基于 user_id 唯一,无重复记录) | ||
select count(1), count(distinct user_id) | ||
from age_of_barbarians ; | ||
|
||
-- 查看基本数据样例 | ||
select user_id, register_time, avg_online_minutes , pay_price, pay_count | ||
from age_of_barbarians | ||
limit 100 ; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,25 @@ | ||
# bigdata_analyse | ||
大数据分析 | ||
大数据分析项目 | ||
|
||
## wish | ||
|
||
通过对不同行业的数据集进行分析,期望达到以下目标: | ||
|
||
- 了解不同领域的业务分析指标 | ||
- 提高数据处理、数据分析能力 | ||
- 增加大数据批处理、流处理的实践经验 | ||
- 增加数据挖掘的实践经验 | ||
|
||
## list | ||
|
||
| 主题 | 技术栈 | 数据集下载 | | ||
| ------------ | ------------ | ------------ | | ||
| [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/RentFromDanke) | 清洗 pandas + 分析 sqlite | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码:9en3 | | ||
| [300 万条《野蛮时代》的玩家数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/AgeOfBarbarians) | 清洗 hive + 分析 hive | [百度网盘](https://pan.baidu.com/s/1Mi5lvGDF405Nk8Y2BZDzdQ) 提取码:paq4 | | ||
| [1000 万条淘宝用户行为数据实时分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Stream/用户行为数据实时分析.md) | 数据源 kafka + 实时分析 flink + 可视化(es + kibana) | [百度网盘](https://pan.baidu.com/s/1wDVQpRV7giIlLJJgRZAInQ) 提取码:gja5 | | ||
| [1 亿条淘宝用户行为数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Batch) | 清洗 hive + 分析 hive | [阿里云](https://tianchi.aliyun.com/dataset/dataDetail?dataId=649&userId=1) 或者 [百度网盘](https://pan.baidu.com/s/15Ss-nDMA120EHhuwpzYm0g) 提取码:5ipq | | ||
|
||
|
||
## todo | ||
|
||
对每个项目增加可视化说明 md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
-- 1.整体情况(出租房数量,每平米租金) | ||
select count(1) as total, -- 出租房总数量 | ||
sum(价格)/sum(面积) as per -- 平均每平米租金 | ||
from rent | ||
|
||
-- 2.地区分析 | ||
select 位置1, count(1) as total, count(distinct 小区) as com, sum(价格)/sum(面积) as per | ||
from rent | ||
group by 位置1 | ||
order by total desc | ||
|
||
-- 3.小区分析 | ||
select 小区, 位置1, count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by 小区, 位置1 | ||
order by total desc | ||
|
||
|
||
-- 4.户型楼层分析 | ||
--户型 | ||
select 户型, count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by 户型 | ||
order by total desc | ||
|
||
--电梯 | ||
select case when 总楼层 > 7 then '电梯房' else '非电梯房' end as tp, count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by case when 总楼层 > 7 then '电梯房' else '非电梯房' end | ||
order by total desc | ||
|
||
-- 所在楼层 | ||
select case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层' | ||
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层' | ||
else '底层' end as tp, | ||
count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层' | ||
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层' | ||
else '底层' end | ||
order by total desc | ||
|
||
-- 电梯&所在楼层 | ||
select case when 总楼层 > 7 then '电梯房' | ||
else '非电梯房' end as tp1, | ||
case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层' | ||
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层' | ||
else '底层' end as tp2, | ||
count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by case when 总楼层 > 7 then '电梯房' | ||
else '非电梯房' end, | ||
case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层' | ||
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层' | ||
else '底层' end | ||
order by 1, 2 desc | ||
|
||
-- 5.交通分析 | ||
|
||
--地铁数 | ||
select 地铁数, count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by 地铁数 | ||
order by 1 | ||
|
||
--距离地铁距离 | ||
select case when 距离地铁距离 between 0 and 500 then '500米以内' | ||
when 距离地铁距离 between 501 and 1000 then '1公里以内' | ||
when 距离地铁距离 between 1001 and 1500 then '1.5公里以内' | ||
else '1.5公里以外' end as ds, | ||
count(1) as total, sum(价格)/sum(面积) as per | ||
from rent | ||
group by case when 距离地铁距离 between 0 and 500 then '500米以内' | ||
when 距离地铁距离 between 501 and 1000 then '1公里以内' | ||
when 距离地铁距离 between 1001 and 1500 then '1.5公里以内' | ||
else '1.5公里以外' end | ||
order by 1 |
Oops, something went wrong.