Assignment 1 – Knowledge Mining

############################################################
# 1. Required Packages
############################################################

library(haven)
Warning: package 'haven' was built under R version 4.4.3
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.4.3
Warning: package 'ggplot2' was built under R version 4.4.3
Warning: package 'tidyr' was built under R version 4.4.2
Warning: package 'dplyr' was built under R version 4.4.3
Warning: package 'lubridate' was built under R version 4.4.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
Warning: package 'janitor' was built under R version 4.4.3

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(skimr)
Warning: package 'skimr' was built under R version 4.4.3
############################################################
# 2. Data Import
############################################################

# Read the TEDS 2016 Stata file straight from GitHub. The `?raw=true` suffix
# requests the file contents rather than the repository's HTML page.
TEDS_2016 <- haven::read_dta(
  file = "https://github.com/datageneration/home/blob/master/DataProgramming/data/TEDS_2016.dta?raw=true"
)

############################################################
# 3. Cleaning Missing Values
############################################################

# Recode the sentinel codes 96-99 to NA in a single pass over the data.
#
# `age` is excluded on purpose: it is measured in years (20-100 in this
# sample), so 96-99 are legitimate ages there and must not be blanked out.
# NOTE(review): 96-99 are assumed to be "missing" codes for every other
# column -- confirm against the TEDS codebook.
TEDS_2016 <- TEDS_2016 %>%
  mutate(across(
    -age,
    ~ .x %>%
      na_if(96) %>%
      na_if(97) %>%
      na_if(98) %>%
      na_if(99)
  ))

############################################################
# 4. Summary Statistics
############################################################

# Per-column five-number summary, mean and NA count for the cleaned data.
TEDS_2016 %>% summary()
    District         Sex             Age           Edu            Arear      
 Min.   : 201   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
 1st Qu.:1401   1st Qu.:1.000   1st Qu.:2.0   1st Qu.:2.000   1st Qu.:1.000  
 Median :6406   Median :1.000   Median :3.0   Median :3.000   Median :3.000  
 Mean   :4661   Mean   :1.486   Mean   :3.3   Mean   :3.334   Mean   :2.744  
 3rd Qu.:6604   3rd Qu.:2.000   3rd Qu.:5.0   3rd Qu.:5.000   3rd Qu.:4.000  
 Max.   :6806   Max.   :2.000   Max.   :5.0   Max.   :9.000   Max.   :6.000  
                                                                             
     Career         Career8          Ethnic          Party      
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 1.00  
 1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.00  
 Median :2.000   Median :4.000   Median :1.000   Median : 7.00  
 Mean   :2.683   Mean   :3.811   Mean   :1.658   Mean   :13.02  
 3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:2.000   3rd Qu.:25.00  
 Max.   :5.000   Max.   :8.000   Max.   :9.000   Max.   :26.00  
                                                                
    PartyID          Tondu           Tondu3           nI2       
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:2.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
 Median :2.000   Median :4.000   Median :2.000   Median :2.000  
 Mean   :4.522   Mean   :4.127   Mean   :2.667   Mean   :1.846  
 3rd Qu.:9.000   3rd Qu.:5.000   3rd Qu.:3.000   3rd Qu.:3.000  
 Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   :3.000  
                                                 NA's   :585    
    votetsai          green         votetsai_nm      votetsai_all   
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :1.0000   Median :0.0000   Median :1.0000   Median :1.0000  
 Mean   :0.6265   Mean   :0.3781   Mean   :0.6265   Mean   :0.5478  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
 NA's   :429                       NA's   :429      NA's   :248     
  Independence     Unification           sq           Taiwanese     
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :1.0000   Median :1.0000  
 Mean   :0.2888   Mean   :0.1225   Mean   :0.5172   Mean   :0.6272  
 3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
                                                                    
      edu            female        whitecollar       lowincome    
 Min.   :1.000   Min.   :0.0000   Min.   :0.0000   Min.   :1.000  
 1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:4.000  
 Median :3.000   Median :0.0000   Median :1.0000   Median :5.000  
 Mean   :3.301   Mean   :0.4864   Mean   :0.5373   Mean   :4.343  
 3rd Qu.:5.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:5.000  
 Max.   :5.000   Max.   :1.0000   Max.   :1.0000   Max.   :5.000  
 NA's   :10                                                       
     income         income_nm           age              KMT        
 Min.   : 1.000   Min.   : 1.000   Min.   : 20.00   Min.   :0.0000  
 1st Qu.: 3.000   1st Qu.: 2.000   1st Qu.: 35.00   1st Qu.:0.0000  
 Median : 5.500   Median : 5.000   Median : 49.00   Median :0.0000  
 Mean   : 5.324   Mean   : 5.281   Mean   : 49.11   Mean   :0.2296  
 3rd Qu.: 7.000   3rd Qu.: 8.000   3rd Qu.: 61.00   3rd Qu.:0.0000  
 Max.   :10.000   Max.   :10.000   Max.   :100.00   Max.   :1.0000  
                  NA's   :330                                       
      DPP              npp             noparty            pfp         
 Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
 Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
 Mean   :0.3497   Mean   :0.02544   Mean   :0.3716   Mean   :0.01893  
 3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.00000  
 Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
                                                                      
     South            north        Minnan_father    Mainland_father 
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :1.0000   Median :0.0000  
 Mean   :0.4947   Mean   :0.4799   Mean   :0.7225   Mean   :0.1024  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
                                                                    
   Econ_worse       Inequality      inequality5      econworse5   
 Min.   :0.0000   Min.   :0.0000   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:4.000   1st Qu.:3.000  
 Median :1.0000   Median :1.0000   Median :5.000   Median :4.000  
 Mean   :0.5544   Mean   :0.9355   Mean   :4.495   Mean   :3.644  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:5.000   3rd Qu.:4.000  
 Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :5.000  
                                                                  
 Govt_for_public     pubwelf5     Govt_dont_care     highincome    
 Min.   :0.0000   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :3.000   Median :0.0000   Median :1.0000  
 Mean   :0.4249   Mean   :2.877   Mean   :0.4988   Mean   :0.5765  
 3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :5.000   Max.   :1.0000   Max.   :1.0000  
                                                   NA's   :330     
    votekmt         votekmt_nm          Blue       Green      No_Party
 Min.   :0.0000   Min.   :0.0000   Min.   :0   Min.   :0   Min.   :0  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0   1st Qu.:0   1st Qu.:0  
 Median :0.0000   Median :0.0000   Median :0   Median :0   Median :0  
 Mean   :0.2053   Mean   :0.2752   Mean   :0   Mean   :0   Mean   :0  
 3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0   3rd Qu.:0   3rd Qu.:0  
 Max.   :1.0000   Max.   :1.0000   Max.   :0   Max.   :0   Max.   :0  
                  NA's   :429                                         
    voteblue       voteblue_nm       votedpp_1        votekmt_1     
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :1.0000   Median :0.0000  
 Mean   :0.2787   Mean   :0.3735   Mean   :0.5256   Mean   :0.2309  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
                  NA's   :429      NA's   :187      NA's   :187     
# Compact overview: missingness, moments, quantiles and an inline histogram
# for every column. Called directly (not piped) so the report keeps the
# data frame's name.
skimr::skim(TEDS_2016)
Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")
Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")
Data summary
Name TEDS_2016
Number of rows 1690
Number of columns 54
_______________________
Column type frequency:
numeric 54
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
District 0 1.00 4660.58 2652.46 201 1401 6406.0 6604 6806 ▃▁▁▁▇
Sex 0 1.00 1.49 0.50 1 1 1.0 2 2 ▇▁▁▁▇
Age 0 1.00 3.30 1.44 1 2 3.0 5 5 ▅▅▅▆▇
Edu 0 1.00 3.33 1.55 1 2 3.0 5 9 ▆▇▇▁▁
Arear 0 1.00 2.74 1.56 1 1 3.0 4 6 ▇▃▃▂▁
Career 0 1.00 2.68 1.43 1 1 2.0 4 5 ▇▆▂▇▃
Career8 0 1.00 3.81 1.98 1 2 4.0 5 8 ▇▅▇▁▅
Ethnic 0 1.00 1.66 1.50 1 1 1.0 2 9 ▇▁▁▁▁
Party 0 1.00 13.02 9.93 1 5 7.0 25 26 ▇▂▁▁▇
PartyID 0 1.00 4.52 3.55 1 2 2.0 9 9 ▇▁▁▁▅
Tondu 0 1.00 4.13 1.77 1 3 4.0 5 9 ▂▇▃▁▁
Tondu3 0 1.00 2.67 1.86 1 2 2.0 3 9 ▇▃▁▁▁
nI2 585 0.65 1.85 0.82 1 1 2.0 3 3 ▇▁▆▁▅
votetsai 429 0.75 0.63 0.48 0 0 1.0 1 1 ▅▁▁▁▇
green 0 1.00 0.38 0.49 0 0 0.0 1 1 ▇▁▁▁▅
votetsai_nm 429 0.75 0.63 0.48 0 0 1.0 1 1 ▅▁▁▁▇
votetsai_all 248 0.85 0.55 0.50 0 0 1.0 1 1 ▆▁▁▁▇
Independence 0 1.00 0.29 0.45 0 0 0.0 1 1 ▇▁▁▁▃
Unification 0 1.00 0.12 0.33 0 0 0.0 0 1 ▇▁▁▁▁
sq 0 1.00 0.52 0.50 0 0 1.0 1 1 ▇▁▁▁▇
Taiwanese 0 1.00 0.63 0.48 0 0 1.0 1 1 ▅▁▁▁▇
edu 10 0.99 3.30 1.49 1 2 3.0 5 5 ▅▂▆▂▇
female 0 1.00 0.49 0.50 0 0 0.0 1 1 ▇▁▁▁▇
whitecollar 0 1.00 0.54 0.50 0 0 1.0 1 1 ▇▁▁▁▇
lowincome 0 1.00 4.34 0.82 1 4 5.0 5 5 ▁▁▁▆▇
income 0 1.00 5.32 2.74 1 3 5.5 7 10 ▅▃▇▃▃
income_nm 330 0.80 5.28 3.05 1 2 5.0 8 10 ▇▅▆▆▆
age 0 1.00 49.11 16.81 20 35 49.0 61 100 ▇▇▇▃▁
KMT 0 1.00 0.23 0.42 0 0 0.0 0 1 ▇▁▁▁▂
DPP 0 1.00 0.35 0.48 0 0 0.0 1 1 ▇▁▁▁▅
npp 0 1.00 0.03 0.16 0 0 0.0 0 1 ▇▁▁▁▁
noparty 0 1.00 0.37 0.48 0 0 0.0 1 1 ▇▁▁▁▅
pfp 0 1.00 0.02 0.14 0 0 0.0 0 1 ▇▁▁▁▁
South 0 1.00 0.49 0.50 0 0 0.0 1 1 ▇▁▁▁▇
north 0 1.00 0.48 0.50 0 0 0.0 1 1 ▇▁▁▁▇
Minnan_father 0 1.00 0.72 0.45 0 0 1.0 1 1 ▃▁▁▁▇
Mainland_father 0 1.00 0.10 0.30 0 0 0.0 0 1 ▇▁▁▁▁
Econ_worse 0 1.00 0.55 0.50 0 0 1.0 1 1 ▆▁▁▁▇
Inequality 0 1.00 0.94 0.25 0 1 1.0 1 1 ▁▁▁▁▇
inequality5 0 1.00 4.49 0.73 1 4 5.0 5 5 ▁▁▁▅▇
econworse5 0 1.00 3.64 0.78 1 3 4.0 4 5 ▁▁▇▇▂
Govt_for_public 0 1.00 0.42 0.49 0 0 0.0 1 1 ▇▁▁▁▆
pubwelf5 0 1.00 2.88 1.17 1 2 3.0 4 5 ▂▇▂▇▁
Govt_dont_care 0 1.00 0.50 0.50 0 0 0.0 1 1 ▇▁▁▁▇
highincome 330 0.80 0.58 0.49 0 0 1.0 1 1 ▆▁▁▁▇
votekmt 0 1.00 0.21 0.40 0 0 0.0 0 1 ▇▁▁▁▂
votekmt_nm 429 0.75 0.28 0.45 0 0 0.0 1 1 ▇▁▁▁▃
Blue 0 1.00 0.00 0.00 0 0 0.0 0 0 ▁▁▇▁▁
Green 0 1.00 0.00 0.00 0 0 0.0 0 0 ▁▁▇▁▁
No_Party 0 1.00 0.00 0.00 0 0 0.0 0 0 ▁▁▇▁▁
voteblue 0 1.00 0.28 0.45 0 0 0.0 1 1 ▇▁▁▁▃
voteblue_nm 429 0.75 0.37 0.48 0 0 0.0 1 1 ▇▁▁▁▅
votedpp_1 187 0.89 0.53 0.50 0 0 1.0 1 1 ▇▁▁▁▇
votekmt_1 187 0.89 0.23 0.42 0 0 0.0 0 1 ▇▁▁▁▂
############################################################
# 5. Recode Tondu Variable
############################################################

# Turn the numeric Tondu codes into a labelled factor.
#
# Only codes 1-6 are declared as levels. Any other code becomes NA, because
# factor() maps values that are not listed in `levels` to NA. The largest
# Tondu code in the data is 9, not 7, so a seventh level for code 7 would
# never match a row and would only add an empty "No response" category.
# NOTE(review): code 9 is presumably "no response" -- confirm against the
# codebook. It is deliberately left as NA so that as.numeric(Tondu) stays on
# the 1-6 scale in the models that use it.
TEDS_2016$Tondu <- factor(
  TEDS_2016$Tondu,
  levels = 1:6,
  labels = c(
    "Unification now",
    "Status quo, unif. in future",
    "Status quo, decide later",
    "Status quo forever",
    "Status quo, indep. in future",
    "Independence now"
  )
)

############################################################
# 6. Frequency Table
############################################################

# One-way frequency table of Tondu: counts, share of all rows, and share of
# non-missing rows (valid_percent).
TEDS_2016 %>%
  janitor::tabyl(Tondu)
                        Tondu   n    percent valid_percent
              Unification now  27 0.01597633    0.01720841
  Status quo, unif. in future 180 0.10650888    0.11472275
     Status quo, decide later 546 0.32307692    0.34799235
           Status quo forever 328 0.19408284    0.20905035
 Status quo, indep. in future 380 0.22485207    0.24219248
             Independence now 108 0.06390533    0.06883365
                  No response   0 0.00000000    0.00000000
                         <NA> 121 0.07159763            NA
############################################################
# 7. Visualization
############################################################

# Bar chart of how many respondents fall into each Tondu category.
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Tondu",
    x = "Tondu Position",
    y = "Count"
  ) +
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it.
  # Slanted labels keep the long category names from overlapping.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

############################################################
# 8. Regression Analysis (OLS)
############################################################

# Analysis sample: the outcome plus the seven predictors, keeping only rows
# that are complete on all of them.
TEDS_reg <- drop_na(
  dplyr::select(
    TEDS_2016,
    Tondu, female, DPP, age, income, edu, Taiwanese, Econ_worse
  )
)

# OLS with the Tondu factor converted to its integer level codes.
# NOTE(review): this treats the ordered categories as equally spaced --
# confirm that is the intended specification.
model1 <- lm(
  formula = as.numeric(Tondu) ~ female + DPP + age + income +
    edu + Taiwanese + Econ_worse,
  data = TEDS_reg
)

summary(model1)

Call:
lm(formula = as.numeric(Tondu) ~ female + DPP + age + income + 
    edu + Taiwanese + Econ_worse, data = TEDS_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.88351 -0.89814  0.00172  0.83385  3.15877 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.572392   0.179977  19.849  < 2e-16 ***
female       0.054122   0.054308   0.997    0.319    
DPP          0.524908   0.059906   8.762  < 2e-16 ***
age         -0.009040   0.002090  -4.324 1.63e-05 ***
income      -0.008403   0.010395  -0.808    0.419    
edu         -0.003484   0.024434  -0.143    0.887    
Taiwanese    0.753306   0.060913  12.367  < 2e-16 ***
Econ_worse  -0.037809   0.055702  -0.679    0.497    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.061 on 1555 degrees of freedom
Multiple R-squared:  0.2082,    Adjusted R-squared:  0.2046 
F-statistic:  58.4 on 7 and 1555 DF,  p-value: < 2.2e-16