- Como criar estatísticas descritivas para a sua análise
- Porcentagens
- Correlações
- Testes Estatísticos
- Formatação de Tabelas
`summarize` funciona com muitas estatísticas:
flights %>% summarize(mean_distance=mean(distance))
## [1] 1039.913
flights %>% summarize(median_distance=median(distance))
## [1] 872
flights %>% summarize(sd_distance=sd(distance))
## [1] 733.233
flights %>% summarize(pct_25_distance=quantile(distance,0.25))
## 25%
## 502
Use `na.rm=TRUE` para tirar linhas 'missing':
flights <- flights %>% mutate(distance=ifelse(carrier=="AA",NA,distance))
flights %>% summarize(mean_distance=mean(distance))
## [1] NA
flights2 %>% summarize(mean_distance=mean(distance,na.rm=TRUE))
## [1] 1007.584
Use `summarise_at` e especifique um vetor dos nomes das variáveis:
vars <- c("distance","sched_dep_time","sched_arr_time")
flights %>% summarize_at(vars,mean)
| distance | sched_dep_time | sched_arr_time |
|---|---|---|
| 1039.913 | 1344.255 | 1536.38 |
Use `summarise_at` e especifique um vetor dos nomes das variáveis:
vars <- c("distance","sched_dep_time","sched_arr_time")
flights %>% summarize_at(vars,mean,na.rm=TRUE)
| distance | sched_dep_time | sched_arr_time |
|---|---|---|
| 1039.913 | 1344.255 | 1536.38 |
flights %>% group_by(origin) %>% summarize(mean_distance=mean(distance))
| origin | mean_distance |
|---|---|
| EWR | 1056.7428 |
| JFK | 1266.2491 |
| LGA | 779.8357 |
Calcule os counts por grupo, depois use `ungroup` e calcule as proporções:
flights %>% group_by(carrier) %>% count() %>% ungroup() %>% mutate(Percentage=100*(n/sum(n)))
| carrier | n | Percentage |
|---|---|---|
| UA | 58665 | 17.419591 |
| B6 | 54635 | 16.222949 |
| EV | 54173 | 16.085766 |
| DL | 48110 | 14.285460 |
| AA | 32729 | 9.718329 |
Substitua o `ungroup` com um `group_by` para a variável do sub-grupo:
flights %>% group_by(carrier, origin) %>% count() %>% group_by(origin) %>% mutate(Percentage_of_origin=round(100*(n/sum(n)),1))
| carrier | origin | n | Percentage_of_origin |
|---|---|---|---|
| OO | EWR | 6 | 0.0 |
| AS | EWR | 714 | 0.6 |
| 9E | EWR | 1268 | 1.0 |
| VX | EWR | 1566 | 1.3 |
| MQ | EWR | 2276 | 1.9 |
| HA | JFK | 342 | 0.3 |
| EV | JFK | 1408 | 1.3 |
| US | JFK | 2995 | 2.7 |
| VX | JFK | 3596 | 3.2 |
| UA | JFK | 4534 | 4.1 |
| OO | LGA | 26 | 0.0 |
| YV | LGA | 601 | 0.6 |
| F9 | LGA | 685 | 0.7 |
| 9E | LGA | 2541 | 2.4 |
| FL | LGA | 3260 | 3.1 |
flights %>% group_by(carrier, origin) %>% count() %>% group_by(carrier) %>% mutate(Percentage_of_airline=round(100*(n/sum(n)),1))
| carrier | origin | n | Percentage_of_airline |
|---|---|---|---|
| 9E | EWR | 1268 | 6.9 |
| 9E | JFK | 14651 | 79.4 |
| 9E | LGA | 2541 | 13.8 |
| AA | EWR | 3487 | 10.7 |
| AA | JFK | 13783 | 42.1 |
| AA | LGA | 15459 | 47.2 |
| AS | EWR | 714 | 100.0 |
| B6 | EWR | 6557 | 12.0 |
| B6 | JFK | 42076 | 77.0 |
| B6 | LGA | 6002 | 11.0 |
| DL | EWR | 4342 | 9.0 |
| DL | JFK | 20701 | 43.0 |
| DL | LGA | 23067 | 47.9 |
| EV | EWR | 43939 | 81.1 |
| EV | JFK | 1408 | 2.6 |
| EV | LGA | 8826 | 16.3 |
| F9 | LGA | 685 | 100.0 |
| FL | LGA | 3260 | 100.0 |
| HA | JFK | 342 | 100.0 |
| MQ | EWR | 2276 | 8.6 |
| MQ | JFK | 7193 | 27.2 |
| MQ | LGA | 16928 | 64.1 |
| OO | EWR | 6 | 18.8 |
| OO | LGA | 26 | 81.2 |
| UA | EWR | 46087 | 78.6 |
| UA | JFK | 4534 | 7.7 |
| UA | LGA | 8044 | 13.7 |
| US | EWR | 4405 | 21.5 |
| US | JFK | 2995 | 14.6 |
| US | LGA | 13136 | 64.0 |
| VX | EWR | 1566 | 30.3 |
| VX | JFK | 3596 | 69.7 |
| WN | EWR | 6188 | 50.4 |
| WN | LGA | 6087 | 49.6 |
| YV | LGA | 601 | 100.0 |
flights %>% summarize(cor=cor(dep_time,dep_delay))
| cor |
|---|
| NA |
flights %>% summarize(cor=cor(dep_time,dep_delay,use="pairwise.complete.obs"))
| cor |
|---|
| 0.2602312 |
Sintaxe: `t.test(outcome_variable ~ grouping_variable, data=.)`
flights %>% filter(origin %in% c("JFK","EWR")) %>%
t.test(dep_delay ~ origin, data=.)
##
## Welch Two Sample t-test
##
## data: dep_delay by origin
## t = 17.762, df = 226960, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.665219 3.326372
## sample estimates:
## mean in group EWR mean in group JFK
## 15.10795 12.11216
Use `tidy()` do pacote `broom` para criar um data.frame bonito:
library(broom)
flights %>% filter(origin %in% c("JFK","EWR")) %>%
t.test(dep_delay ~ origin, data=.) %>%
tidy()
| estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|
| 2.995795 | 15.10795 | 12.11216 | 17.76196 | 0 | 226958.1 | 2.665219 | 3.326372 | Welch Two Sample t-test | two.sided |
Use `count()` para cada categoria (aeroporto):
flights %>% filter(origin %in% c("JFK","EWR")) %>%
group_by(origin) %>%
count() %>%
ungroup() %>%
pull(n) %>%
as.table() %>%
prop.test()
##
## 1-sample proportions test with continuity correction
##
## data: ., null probability 0.5
## X-squared = 393.33, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5185499 0.5226189
## sample estimates:
## p
## 0.5205847
outcome_vars <- c("dep_delay","arr_delay","month")
flights %>% filter(origin %in% c("JFK","EWR")) %>%
summarise_at(outcome_vars,funs(t.test(.[origin=="JFK"], .[origin=="EWR"])$p.value))
| dep_delay | arr_delay | month |
|---|---|---|
| 0 | 0 | 0.758 |
`kable` é ótimo para estatísticas descritivas:
- `caption`
- `align` texto dentro de colunas
- `digits`
- `col.names`
- `format.args` de números

table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg)
| origin | dest | distance | air_time |
|---|---|---|---|
| EWR | IAH | 1400 | 227 |
| LGA | IAH | 1416 | 227 |
| JFK | MIA | 1089 | 160 |
| JFK | BQN | 1576 | 183 |
| LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg,caption="Sample Data")
| origin | dest | distance | air_time |
|---|---|---|---|
| EWR | IAH | 1400 | 227 |
| LGA | IAH | 1416 | 227 |
| JFK | MIA | 1089 | 160 |
| JFK | BQN | 1576 | 183 |
| LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg,caption="Sample Data",align="ccrr")
| origin | dest | distance | air_time |
|---|---|---|---|
| EWR | IAH | 1400 | 227 |
| LGA | IAH | 1416 | 227 |
| JFK | MIA | 1089 | 160 |
| JFK | BQN | 1576 | 183 |
| LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg,caption="Sample Data",align="ccrr",digits=1)
| origin | dest | distance | air_time |
|---|---|---|---|
| EWR | IAH | 1400 | 227 |
| LGA | IAH | 1416 | 227 |
| JFK | MIA | 1089 | 160 |
| JFK | BQN | 1576 | 183 |
| LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg,caption="Sample Data",align="ccrr",digits=1,
col.names=c("Take Off","Landing","Distance","Duration"))
| Take Off | Landing | Distance | Duration |
|---|---|---|---|
| EWR | IAH | 1400 | 227 |
| LGA | IAH | 1416 | 227 |
| JFK | MIA | 1089 | 160 |
| JFK | BQN | 1576 | 183 |
| LGA | ATL | 762 | 116 |
table_eg <- flights %>% slice(1:5) %>% select(origin, dest, distance, air_time)
kable(table_eg,caption="Sample Data",align="ccrr",digits=1,
col.names=c("Take Off","Landing","Distance","Duration"),format.args=list(big.mark=','))
| Take Off | Landing | Distance | Duration |
|---|---|---|---|
| EWR | IAH | 1,400 | 227 |
| LGA | IAH | 1,416 | 227 |
| JFK | MIA | 1,089 | 160 |
| JFK | BQN | 1,576 | 183 |
| LGA | ATL | 762 | 116 |