Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
way committed Dec 25, 2020
1 parent 57a196c commit 61920cc
Show file tree
Hide file tree
Showing 12 changed files with 6,056 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.idea/
98 changes: 98 additions & 0 deletions AgeOfBarbarians/analyse.hql
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
--1.新增用户分析

-- 新增用户总量
select count(1) from age_of_barbarians; --3116941

-- PU ( Paying Users):付费用户总量
select count(1) from age_of_barbarians where pay_price > 0; --60988

-- DNU(Daily New Users): 每日游戏中的新登入用户数量,即每日新用户数。
```
点击:点击广告页或者点击广告链接数
下载:点击后成功下载用户数
安装:下载程序并成功安装用户数
激活:成功安装并首次激活应用程序
注册:产生user_id
DNU:产生user_id并且首次登陆
```
select cast(register_time as date) as day,
count(1) as dnu,
sum(case when pay_price > 0 then 1 else 0 end ) as dnpu
from age_of_barbarians
group by cast(register_time as date)
order by day;

-- 每小时的新登入用户数量
select hour(cast(register_time as datetime)) as hour,
count(1) as dnu,
sum(case when pay_price > 0 then 1 else 0 end ) as dnpu
from age_of_barbarians
group by hour(cast(register_time as datetime))
order by hour;


--2.用户活跃度分析

-- DAU、WAU、MAU(Daily Active Users、Weekly Active Users、Monthly Active Users):每日、每周、每月登陆游戏的用户数,一般为自然周与自然月。

-- 平均在线时长
select avg(avg_online_minutes) from age_of_barbarians; --10.615346021266106

-- 付费玩家的平均在线时长
select avg(avg_online_minutes) from age_of_barbarians where pay_price > 0; --138.80478126869235


--3.玩家付费情况分析

-- APA(Active Payment Account):活跃付费用户数。
select count(1) as APA from age_of_barbarians where pay_price > 0 and avg_online_minutes > 0; --60987

-- ARPU(Average Revenue Per User) :平均每用户收入。
select sum(pay_price)/sum(case when avg_online_minutes > 0 then 1 else 0 end) from age_of_barbarians; --0.5824066558640159

-- ARPPU (Average Revenue Per Paying User): 平均每付费用户收入。
select sum(pay_price)/sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) from age_of_barbarians; --29.190265138469332

-- PUR(Pay User Rate):付费比率,可通过 APA/AU 计算得出。
select sum(case when avg_online_minutes > 0 and pay_price > 0 then 1 else 0 end) / sum(case when avg_online_minutes > 0 then 1 else 0 end)
from age_of_barbarians; --0.019952085159256484

-- 付费玩家人数,付费总额,付费总次数,平均每人付费,平均每人付费次数,平均每次付费
select count(1) as pu, --60988
sum(pay_price) as sum_pay_price, --1780226.6999998293
avg(pay_price) as avg_pay_price, --29.189786515377275
sum(pay_count) as sum_pay_count, --193030.0
avg(pay_count) as avg_pay_count, --3.165048862071227
sum(pay_price) / sum(pay_count) as each_pay_price --9.222538983576797
from age_of_barbarians
where pay_price > 0;


--4.玩家习惯分析

--胜率
select sum(pvp_win_count) / sum(pvp_battle_count) as pvp_win_rate, --玩家pvp胜率
sum(case when pay_price > 0 then pvp_win_count else 0 end) / sum(case when pay_price > 0 then pvp_battle_count else 0 end) as pve_win_rate_pay, --付费玩家pve胜率
sum(case when pay_price = 0 then pvp_win_count else 0 end) / sum(case when pay_price = 0 then pvp_battle_count else 0 end) as pve_win_rate_nor, --非付费玩家pve胜率
sum(pve_win_count) / sum(pve_battle_count) as pve_win_rate, --玩家pve胜率
sum(case when pay_price > 0 then pve_win_count else 0 end) / sum(case when pay_price > 0 then pve_battle_count else 0 end) as pve_win_rate_pay, --付费玩家pve胜率
sum(case when pay_price = 0 then pve_win_count else 0 end) / sum(case when pay_price = 0 then pve_battle_count else 0 end) as pve_win_rate_nor --非付费玩家pve胜率
from age_of_barbarians;

--pvp场次
select sum(pvp_battle_count) as pvp_battle_count, --玩家pvp场次
avg(pvp_battle_count) as pvp_battle_count_avg, --玩家平均pvp场次
sum(case when pay_price > 0 then pvp_battle_count else 0 end) as pvp_battle_count_pay, --付费玩家pvp场次
sum(case when pay_price > 0 then pvp_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end), --付费玩家平均pvp场次
sum(case when pay_price = 0 then pvp_battle_count else 0 end) as pvp_battle_count_nor, --非付费玩家pvp场次
sum(case when pay_price = 0 then pvp_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) --非付费玩家平均pvp场次
from age_of_barbarians;

--pve场次
select sum(pve_battle_count) as pve_battle_count, --玩家pve场次
avg(pve_battle_count) as pve_battle_count_avg, --玩家平均pve场次
sum(case when pay_price > 0 then pve_battle_count else 0 end) as pve_battle_count_pay, --付费玩家pve场次
sum(case when pay_price > 0 then pve_battle_count else 0 end) / sum(case when pay_price > 0 then 1 else 0 end), --付费玩家平均pve场次
sum(case when pay_price = 0 then pve_battle_count else 0 end) as pve_battle_count_nor, --非付费玩家pvp场次
sum(case when pay_price = 0 then pve_battle_count else 0 end) / sum(case when pay_price = 0 then 1 else 0 end) --非付费玩家平均pve场次
from age_of_barbarians;
143 changes: 143 additions & 0 deletions AgeOfBarbarians/table.hql
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/*
说明:手游《野蛮时代》玩家数据
来源:https://js.dclab.run/v2/cmptDetail.html?id=226
大小:642 M (训练集)+ 219 M(测试集)= 861 M
记录数:2,288,007 (训练集)+ 828,934(测试集)= 3,116,941
字段数:109
*/

-- 建表
drop table if exists age_of_barbarians;
create table age_of_barbarians (
`user_id` string comment '玩家唯一ID',
`register_time` string comment '玩家注册时间',
`wood_add_value` string comment '木头获取数量',
`wood_reduce_value` string comment '木头消耗数量',
`stone_add_value` string comment '石头获取数量',
`stone_reduce_value` string comment '石头消耗数量',
`ivory_add_value` string comment '象牙获取数量',
`ivory_reduce_value` string comment '象牙消耗数量',
`meat_add_value` string comment '肉获取数量',
`meat_reduce_value` string comment '肉消耗数量',
`magic_add_value` string comment '魔法获取数量',
`magic_reduce_value` string comment '魔法消耗数量',
`infantry_add_value` string comment '勇士招募数量',
`infantry_reduce_value` string comment '勇士损失数量',
`cavalry_add_value` string comment '驯兽师招募数量',
`cavalry_reduce_value` string comment '驯兽师损失数量',
`shaman_add_value` string comment '萨满招募数量',
`shaman_reduce_value` string comment '萨满损失数量',
`wound_infantry_add_value` string comment '勇士伤兵产生数量',
`wound_infantry_reduce_value` string comment '勇士伤兵恢复数量',
`wound_cavalry_add_value` string comment '驯兽师伤兵产生数量',
`wound_cavalry_reduce_value` string comment '驯兽师伤兵恢复数量',
`wound_shaman_add_value` string comment '萨满伤兵产生数量',
`wound_shaman_reduce_value` string comment '萨满伤兵恢复数量',
`general_acceleration_add_value` string comment '通用加速获取数量',
`general_acceleration_reduce_value` string comment '通用加速使用数量',
`building_acceleration_add_value` string comment '建筑加速获取数量',
`building_acceleration_reduce_value` string comment '建筑加速使用数量',
`reaserch_acceleration_add_value` string comment '科研加速获取数量',
`reaserch_acceleration_reduce_value` string comment '科研加速使用数量',
`training_acceleration_add_value` string comment '训练加速获取数量',
`training_acceleration_reduce_value` string comment '训练加速使用数量',
`treatment_acceleraion_add_value` string comment '治疗加速获取数量',
`treatment_acceleration_reduce_value` string comment '治疗加速使用数量',
`bd_training_hut_level` string comment '建筑:士兵小屋等级',
`bd_healing_lodge_level` string comment '建筑:治疗小井等级',
`bd_stronghold_level` string comment '建筑:要塞等级',
`bd_outpost_portal_level` string comment '建筑:据点传送门等级',
`bd_barrack_level` string comment '建筑:兵营等级',
`bd_healing_spring_level` string comment '建筑:治疗之泉等级',
`bd_dolmen_level` string comment '建筑:智慧神庙等级',
`bd_guest_cavern_level` string comment '建筑:联盟大厅等级',
`bd_warehouse_level` string comment '建筑:仓库等级',
`bd_watchtower_level` string comment '建筑:瞭望塔等级',
`bd_magic_coin_tree_level` string comment '建筑:魔法幸运树等级',
`bd_hall_of_war_level` string comment '建筑:战争大厅等级',
`bd_market_level` string comment '建筑:联盟货车等级',
`bd_hero_gacha_level` string comment '建筑:占卜台等级',
`bd_hero_strengthen_level` string comment '建筑:祭坛等级',
`bd_hero_pve_level` string comment '建筑:冒险传送门等级',
`sr_scout_level` string comment '科研:侦查等级',
`sr_training_speed_level` string comment '科研:训练速度等级',
`sr_infantry_tier_2_level` string comment '科研:守护者',
`sr_cavalry_tier_2_level` string comment '科研:巨兽驯兽师',
`sr_shaman_tier_2_level` string comment '科研:吟唱者',
`sr_infantry_atk_level` string comment '科研:勇士攻击',
`sr_cavalry_atk_level` string comment '科研:驯兽师攻击',
`sr_shaman_atk_level` string comment '科研:萨满攻击',
`sr_infantry_tier_3_level` string comment '科研:战斗大师',
`sr_cavalry_tier_3_level` string comment '科研:高阶巨兽骑兵',
`sr_shaman_tier_3_level` string comment '科研:图腾大师',
`sr_troop_defense_level` string comment '科研:部队防御',
`sr_infantry_def_level` string comment '科研:勇士防御',
`sr_cavalry_def_level` string comment '科研:驯兽师防御',
`sr_shaman_def_level` string comment '科研:萨满防御',
`sr_infantry_hp_level` string comment '科研:勇士生命',
`sr_cavalry_hp_level` string comment '科研:驯兽师生命',
`sr_shaman_hp_level` string comment '科研:萨满生命',
`sr_infantry_tier_4_level` string comment '科研:狂战士',
`sr_cavalry_tier_4_level` string comment '科研:龙骑兵',
`sr_shaman_tier_4_level` string comment '科研:神谕者',
`sr_troop_attack_level` string comment '科研:部队攻击',
`sr_construction_speed_level` string comment '科研:建造速度',
`sr_hide_storage_level` string comment '科研:资源保护',
`sr_troop_consumption_level` string comment '科研:部队消耗',
`sr_rss_a_prod_levell` string comment '科研:木材生产',
`sr_rss_b_prod_level` string comment '科研:石头生产',
`sr_rss_c_prod_level` string comment '科研:象牙生产',
`sr_rss_d_prod_level` string comment '科研:肉类生产',
`sr_rss_a_gather_level` string comment '科研:木材采集',
`sr_rss_b_gather_level` string comment '科研:石头采集',
`sr_rss_c_gather_level` string comment '科研:象牙采集',
`sr_rss_d_gather_level` string comment '科研:肉类生产',
`sr_troop_load_level` string comment '科研:部队负重',
`sr_rss_e_gather_level` string comment '科研:魔法采集',
`sr_rss_e_prod_level` string comment '科研:魔法生产',
`sr_outpost_durability_level` string comment '科研:据点耐久',
`sr_outpost_tier_2_level` string comment '科研:据点二',
`sr_healing_space_level` string comment '科研:医院容量',
`sr_gathering_hunter_buff_level` string comment '科研:领土采集奖励',
`sr_healing_speed_level` string comment '科研:治疗速度',
`sr_outpost_tier_3_level` string comment '科研:据点三',
`sr_alliance_march_speed_level` string comment '科研:联盟行军速度',
`sr_pvp_march_speed_level` string comment '科研:战斗行军速度',
`sr_gathering_march_speed_level` string comment '科研:采集行军速度',
`sr_outpost_tier_4_level` string comment '科研:据点四',
`sr_guest_troop_capacity_level` string comment '科研:增援部队容量',
`sr_march_size_level` string comment '科研:行军大小',
`sr_rss_help_bonus_level` string comment '科研:资源帮助容量',
`pvp_battle_count` string comment 'PVP次数',
`pvp_lanch_count` string comment '主动发起PVP次数',
`pvp_win_count` string comment 'PVP胜利次数',
`pve_battle_count` string comment 'PVE次数',
`pve_lanch_count` string comment '主动发起PVE次数',
`pve_win_count` string comment 'PVE胜利次数',
`avg_online_minutes` string comment '在线时长',
`pay_price` string comment '付费金额',
`pay_count` string comment '付费次数',
`prediction_pay_price` string comment '45日付费金额' )
row format delimited
fields terminated by ','
lines terminated by '\n';

-- 加载数据
LOAD DATA LOCAL INPATH '/home/getway/tap_fun_test.csv'
INTO TABLE age_of_barbarians ;
LOAD DATA LOCAL INPATH '/home/getway/tap_fun_train.csv'
INTO TABLE age_of_barbarians ;

-- 去掉 csv 的标题数据
insert overwrite table age_of_barbarians
select * from age_of_barbarians
where user_id <> 'user_id';

-- 查看数据重复性(基于 user_id 唯一,无重复记录)
select count(1), count(distinct user_id)
from age_of_barbarians ;

-- 查看基本数据样例
select user_id, register_time, avg_online_minutes , pay_price, pay_count
from age_of_barbarians
limit 100 ;
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,25 @@
# bigdata_analyse
大数据分析
大数据分析项目

## wish

通过对不同行业的数据集进行分析,期望达到以下目标:

- 了解不同领域的业务分析指标
- 提高数据处理、数据分析能力
- 增加大数据批处理、流处理的实践经验
- 增加数据挖掘的实践经验

## list

| 主题 | 技术栈 | 数据集下载 |
| ------------ | ------------ | ------------ |
| [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/RentFromDanke) | 清洗 pandas + 分析 sqlite | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码:9en3 |
| [300 万条《野蛮时代》的玩家数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/AgeOfBarbarians) | 清洗 hive + 分析 hive | [百度网盘](https://pan.baidu.com/s/1Mi5lvGDF405Nk8Y2BZDzdQ) 提取码:paq4 |
| [1000 万条淘宝用户行为数据实时分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Stream/用户行为数据实时分析.md) | 数据源 kafka + 实时分析 flink + 可视化(es + kibana) | [百度网盘](https://pan.baidu.com/s/1wDVQpRV7giIlLJJgRZAInQ) 提取码:gja5 |
| [1 亿条淘宝用户行为数据分析](https://github.com/TurboWay/bigdata_analyse/blob/master/UserBehaviorFromTaobao_Batch) | 清洗 hive + 分析 hive | [阿里云](https://tianchi.aliyun.com/dataset/dataDetail?dataId=649&userId=1) 或者 [百度网盘](https://pan.baidu.com/s/15Ss-nDMA120EHhuwpzYm0g) 提取码:5ipq |


## todo

对每个项目增加可视化说明 md
77 changes: 77 additions & 0 deletions RentFromDanke/analyse.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
-- 1.整体情况(出租房数量,每平米租金)
select count(1) as total, -- 出租房总数量
sum(价格)/sum(面积) as per -- 平均每平米租金
from rent

-- 2.地区分析
select 位置1, count(1) as total, count(distinct 小区) as com, sum(价格)/sum(面积) as per
from rent
group by 位置1
order by total desc

-- 3.小区分析
select 小区, 位置1, count(1) as total, sum(价格)/sum(面积) as per
from rent
group by 小区, 位置1
order by total desc


-- 4.户型楼层分析
--户型
select 户型, count(1) as total, sum(价格)/sum(面积) as per
from rent
group by 户型
order by total desc

--电梯
select case when 总楼层 > 7 then '电梯房' else '非电梯房' end as tp, count(1) as total, sum(价格)/sum(面积) as per
from rent
group by case when 总楼层 > 7 then '电梯房' else '非电梯房' end
order by total desc

-- 所在楼层
select case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层'
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层'
else '底层' end as tp,
count(1) as total, sum(价格)/sum(面积) as per
from rent
group by case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层'
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层'
else '底层' end
order by total desc

-- 电梯&所在楼层
select case when 总楼层 > 7 then '电梯房'
else '非电梯房' end as tp1,
case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层'
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层'
else '底层' end as tp2,
count(1) as total, sum(价格)/sum(面积) as per
from rent
group by case when 总楼层 > 7 then '电梯房'
else '非电梯房' end,
case when 1.0 * 所在楼层/总楼层 > 0.66 then '高层'
when 1.0 * 所在楼层/总楼层 > 0.33 then '中层'
else '底层' end
order by 1, 2 desc

-- 5.交通分析

--地铁数
select 地铁数, count(1) as total, sum(价格)/sum(面积) as per
from rent
group by 地铁数
order by 1

--距离地铁距离
select case when 距离地铁距离 between 0 and 500 then '500米以内'
when 距离地铁距离 between 501 and 1000 then '1公里以内'
when 距离地铁距离 between 1001 and 1500 then '1.5公里以内'
else '1.5公里以外' end as ds,
count(1) as total, sum(价格)/sum(面积) as per
from rent
group by case when 距离地铁距离 between 0 and 500 then '500米以内'
when 距离地铁距离 between 501 and 1000 then '1公里以内'
when 距离地铁距离 between 1001 and 1500 then '1.5公里以内'
else '1.5公里以外' end
order by 1
Loading

0 comments on commit 61920cc

Please sign in to comment.