- Como criar estatísticas descritivas para a sua análise
- Porcentagens
- Correlações
- Testes Estatísticos
- Formatação de Tabelas
`summarize` funciona com muitas estatísticas:
flights %>% summarize(mean_distance=mean(distance))
## [1] 1039.913
flights %>% summarize(median_distance=median(distance))
## [1] 872
flights %>% summarize(sd_distance=sd(distance))
## [1] 733.233
flights %>% summarize(pct_25_distance=quantile(distance,0.25))
## 25%
## 502
Use `na.rm=TRUE` para tirar os valores 'missing' (NA) do cálculo:
flights2 <- flights %>% mutate(distance=ifelse(carrier=="AA",NA,distance))
flights2 %>% summarize(mean_distance=mean(distance))
## [1] NA
flights2 %>% summarize(mean_distance=mean(distance,na.rm=TRUE))
## [1] 1007.584
Use `summarise_at` e especifique um vetor com os nomes das variáveis:
vars <- c("distance","sched_dep_time","sched_arr_time")
flights %>% summarize_at(vars,mean)
distance | sched_dep_time | sched_arr_time |
---|---|---|
1039.913 | 1344.255 | 1536.38 |
Use `summarise_at` e especifique um vetor com os nomes das variáveis; argumentos adicionais (como `na.rm=TRUE`) são repassados para a função:
vars <- c("distance","sched_dep_time","sched_arr_time")
flights %>% summarize_at(vars,mean,na.rm=TRUE)
distance | sched_dep_time | sched_arr_time |
---|---|---|
1039.913 | 1344.255 | 1536.38 |
flights %>% group_by(origin) %>% summarize(mean_distance=mean(distance))
origin | mean_distance |
---|---|
EWR | 1056.7428 |
JFK | 1266.2491 |
LGA | 779.8357 |
Calcule os `counts` por grupo, depois use `ungroup` e calcule as proporções:
flights %>% group_by(carrier) %>% count() %>% ungroup() %>% mutate(Percentage=100*(n/sum(n)))
carrier | n | Percentage |
---|---|---|
UA | 58665 | 17.419591 |
B6 | 54635 | 16.222949 |
EV | 54173 | 16.085766 |
DL | 48110 | 14.285460 |
AA | 32729 | 9.718329 |
Substitua o `ungroup` por um `group_by` para a variável do sub-grupo:
flights %>% group_by(carrier, origin) %>% count() %>% group_by(origin) %>% mutate(Percentage_of_origin=round(100*(n/sum(n)),1))
carrier | origin | n | Percentage_of_origin |
---|---|---|---|
OO | EWR | 6 | 0.0 |
AS | EWR | 714 | 0.6 |
9E | EWR | 1268 | 1.0 |
VX | EWR | 1566 | 1.3 |
MQ | EWR | 2276 | 1.9 |
HA | JFK | 342 | 0.3 |
EV | JFK | 1408 | 1.3 |
US | JFK | 2995 | 2.7 |
VX | JFK | 3596 | 3.2 |
UA | JFK | 4534 | 4.1 |
OO | LGA | 26 | 0.0 |
YV | LGA | 601 | 0.6 |
F9 | LGA | 685 | 0.7 |
9E | LGA | 2541 | 2.4 |
FL | LGA | 3260 | 3.1 |
flights %>% group_by(carrier, origin) %>% count() %>% group_by(carrier) %>% mutate(Percentage_of_airline=round(100*(n/sum(n)),1))
carrier | origin | n | Percentage_of_airline |
---|---|---|---|
9E | EWR | 1268 | 6.9 |
9E | JFK | 14651 | 79.4 |
9E | LGA | 2541 | 13.8 |
AA | EWR | 3487 | 10.7 |
AA | JFK | 13783 | 42.1 |
AA | LGA | 15459 | 47.2 |
AS | EWR | 714 | 100.0 |
B6 | EWR | 6557 | 12.0 |
B6 | JFK | 42076 | 77.0 |
B6 | LGA | 6002 | 11.0 |
DL | EWR | 4342 | 9.0 |
DL | JFK | 20701 | 43.0 |
DL | LGA | 23067 | 47.9 |
EV | EWR | 43939 | 81.1 |
EV | JFK | 1408 | 2.6 |
EV | LGA | 8826 | 16.3 |
F9 | LGA | 685 | 100.0 |
FL | LGA | 3260 | 100.0 |
HA | JFK | 342 | 100.0 |
MQ | EWR | 2276 | 8.6 |
MQ | JFK | 7193 | 27.2 |
MQ | LGA | 16928 | 64.1 |
OO | EWR | 6 | 18.8 |
OO | LGA | 26 | 81.2 |
UA | EWR | 46087 | 78.6 |
UA | JFK | 4534 | 7.7 |
UA | LGA | 8044 | 13.7 |
US | EWR | 4405 | 21.5 |
US | JFK | 2995 | 14.6 |
US | LGA | 13136 | 64.0 |
VX | EWR | 1566 | 30.3 |
VX | JFK | 3596 | 69.7 |
WN | EWR | 6188 | 50.4 |
WN | LGA | 6087 | 49.6 |
YV | LGA | 601 | 100.0 |
flights %>% summarize(cor=cor(dep_time,dep_delay))
cor |
---|
NA |
flights %>% summarize(cor=cor(dep_time,dep_delay,use="pairwise.complete.obs"))
cor |
---|
0.2602312 |
t.test(outcome_variable ~ grouping_variable, data=.)
flights %>% filter(origin %in% c("JFK","EWR")) %>% t.test(dep_delay ~ origin, data=.)
##
##  Welch Two Sample t-test
##
## data:  dep_delay by origin
## t = 17.762, df = 226960, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.665219 3.326372
## sample estimates:
## mean in group EWR mean in group JFK
##          15.10795          12.11216
Use `tidy()` do pacote `broom` para criar um data.frame bonito:
library(broom)
flights %>% filter(origin %in% c("JFK","EWR")) %>% t.test(dep_delay ~ origin, data=.) %>% tidy()
estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
---|---|---|---|---|---|---|---|---|---|
2.995795 | 15.10795 | 12.11216 | 17.76196 | 0 | 226958.1 | 2.665219 | 3.326372 | Welch Two Sample t-test | two.sided |
Calcule o `count()` para cada categoria (aeroporto) e passe o resultado para `prop.test()`:
flights %>% filter(origin %in% c("JFK","EWR")) %>% group_by(origin) %>% count() %>% ungroup() %>% pull(n) %>% as.table() %>% prop.test()
##
##  1-sample proportions test with continuity correction
##
## data:  ., null probability 0.5
## X-squared = 393.33, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5185499 0.5226189
## sample estimates:
##         p
## 0.5205847
outcome_vars <- c("dep_delay","arr_delay","month") flights %>% filter(origin %in% c("JFK","EWR")) %>% summarise_at(outcome_vars,funs(t.test(.[origin=="JFK"], .[origin=="EWR"])$p.value))
dep_delay | arr_delay | month |
---|---|---|
0 | 0 | 0.758 |
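`kable` é ótimo para estatísticas descritivas. Argumentos úteis:
- `caption`: título da tabela
- `align`: alinhamento do texto dentro de colunas
- `digits`: número de casas decimais
- `col.names`: nomes das colunas
- `format.args`: formatação de números
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg)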
origin | dest | distance | air_time |
---|---|---|---|
EWR | IAH | 1400 | 227 |
LGA | IAH | 1416 | 227 |
JFK | MIA | 1089 | 160 |
JFK | BQN | 1576 | 183 |
LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time) kable(table_eg,caption="Sample Data")
origin | dest | distance | air_time |
---|---|---|---|
EWR | IAH | 1400 | 227 |
LGA | IAH | 1416 | 227 |
JFK | MIA | 1089 | 160 |
JFK | BQN | 1576 | 183 |
LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time) kable(table_eg,caption="Sample Data",align="ccrr")
origin | dest | distance | air_time |
---|---|---|---|
EWR | IAH | 1400 | 227 |
LGA | IAH | 1416 | 227 |
JFK | MIA | 1089 | 160 |
JFK | BQN | 1576 | 183 |
LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time) kable(table_eg,caption="Sample Data",align="ccrr",digits=1)
origin | dest | distance | air_time |
---|---|---|---|
EWR | IAH | 1400 | 227 |
LGA | IAH | 1416 | 227 |
JFK | MIA | 1089 | 160 |
JFK | BQN | 1576 | 183 |
LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time) kable(table_eg,caption="Sample Data",align="ccrr",digits=1, col.names=c("Take Off","Landing","Distance","Duration"))
Take Off | Landing | Distance | Duration |
---|---|---|---|
EWR | IAH | 1400 | 227 |
LGA | IAH | 1416 | 227 |
JFK | MIA | 1089 | 160 |
JFK | BQN | 1576 | 183 |
LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time) kable(table_eg,caption="Sample Data",align="ccrr",digits=1, col.names=c("Take Off","Landing","Distance","Duration"),format.args=list(big.mark=','))
Take Off | Landing | Distance | Duration |
---|---|---|---|
EWR | IAH | 1,400 | 227 |
LGA | IAH | 1,416 | 227 |
JFK | MIA | 1,089 | 160 |
JFK | BQN | 1,576 | 183 |
LGA | ATL | 762 | 116 |