---
title: "Descriptive Statistics from the SK Multicultural Data V2"
format:
html:
self-contained: true
number-sections: true
toc: true
toc-location: left
toc-depth: 3
code-fold: false
collapse: true
code-tools: true
editor: visual
---
```{r set_environment}
#| echo: FALSE
#| results: "hide"
#| message: FALSE
library(tidyverse)
library(plotly)
library(haven)
setwd("~/Dropbox/MEA-THL/Multicultural/code")
```
## Using Multicultural Household Survey {.panel-tabset .panel-tabset-closed}
```{r load}
#| echo: FALSE
#| message: FALSE
fam_2015 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성원_2015.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_fam"))) %>%
rename(hhid = v3_fam)
fam_2018 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성원_2018.dta")%>%
rename_with(.fn = ~ ifelse(.x == c("v1", "v2"), .x, paste0(.x, "_fam")))
fam_2021 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성원_2021.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_fam")))
tab_2015 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성표_2015.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_tab")))%>%
rename(hhid = v1)
tab_2018 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성표_2018.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_tab")))
tab_2021 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/가구구성표_2021.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_tab")))
self_2015 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/본인_2015.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_self")))%>%
rename(hhid = v2_self)
self_2018 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/본인_2018.dta")%>%
rename_with(.fn = ~ ifelse(.x == c("v1", "v2"), .x, paste0(.x, "_self")))
self_2021 <- read_dta("/Users/tlim/Dropbox/Data/전국다문화가족실태조사/본인_2021.dta")%>%
rename_with(.fn = ~ ifelse(.x == "v1", .x, paste0(.x, "_self")))
```
```{r merge}
#| echo: false
#| message: FALSE
df_2015 <- full_join(fam_2015, tab_2015) %>%
full_join(self_2015) %>%
mutate(wave="2015") %>%
rename(id = v1,
relation = v4_fam,
entry_yr = v16_fam,
naturalized_bi = v17_fam,
naturalize_yr = v18_fam,
nationality_birth = v14_fam,
income = v8_tab,
weight = v10_tab)
df_2018 <- full_join(fam_2018, tab_2018) %>%
full_join(self_2018) %>%
mutate(wave="2018") %>%
rename(hhid = v1,
relation = v4_fam,
entry_yr = v16_fam,
naturalized_bi = v17_fam,
naturalize_yr = v18_fam,
nationality_birth = v12_fam,
income = v36_tab,
weight = v38_tab)
df_2021 <- full_join(fam_2021, tab_2021) %>%
full_join(self_2021) %>%
mutate(wave="2021") %>%
rename(hhid = v1,
relation = v3_fam,
entry_yr = v18_fam,
naturalized_bi = v19_fam,
naturalize_yr = v20_fam,
nationality_birth = v13_fam,
income = v8_tab,
weight = v11_tab)
```
### Number of Naturalized Respondents in the Sample
<!-- <details> -->
2 1 \[ 귀화연도 있음\] or 2 1 \[ 1. 해당\]
```{r naturalize}
#| echo: FALSE
#| message: FALSE
nat_2015 <- df_2015 %>% filter(relation == 1) %>% count(naturalize_yr)
df_2015 %>% filter(relation == 1) %>% count(naturalized_bi) ##귀화여부
df_2015 <- df_2015 %>%
mutate(years_kr = as.numeric(naturalize_yr) - as.numeric(entry_yr))
yrs_2015 <- df_2015 %>% filter(relation == 1) %>% count(years_kr)
yrs_2015_pre <- df_2015 %>% filter(relation == 1) %>% filter(entry_yr < 2011) %>% count(years_kr)
yrs_2015_post <- df_2015 %>% filter(relation == 1) %>% filter(entry_yr > 2011) %>% count(years_kr)
nat_2018 <- df_2018 %>% filter(relation == 1) %>% count(naturalize_yr)
df_2018 %>% filter(relation == 1) %>% count(naturalized_bi) ##귀화여부
df_2018 <- df_2018 %>%
mutate(years_kr = as.numeric(naturalize_yr) - as.numeric(entry_yr))
yrs_2018 <- df_2018 %>% filter(relation == 1) %>% count(years_kr)
yrs_2018_pre <- df_2018 %>% filter(relation == 1) %>% filter(entry_yr < 2011) %>% count(years_kr)
yrs_2018_post <- df_2018 %>% filter(relation == 1) %>% filter(entry_yr > 2011) %>% count(years_kr)
nat_2021 <- df_2021 %>% filter(relation == 1) %>% count(naturalize_yr)
df_2021 %>% filter(relation == 1) %>% count(naturalized_bi) ##귀화여부
df_2021 <- df_2021 %>%
mutate(years_kr = as.numeric(naturalize_yr) - as.numeric(entry_yr))
yrs_2021 <- df_2021 %>% filter(relation == 1) %>% count(years_kr)
yrs_2021_pre <- df_2021 %>% filter(relation == 1) %>% filter(entry_yr < 2011) %>% count(years_kr)
yrs_2021_post <- df_2021 %>% filter(relation == 1) %>% filter(entry_yr > 2011) %>% count(years_kr)
```
```{r filter_n_merge}
#| echo: FALSE
#| message: FALSE
df_slim_2015 <- df_2015 %>%
select(hhid, relation, naturalize_yr, naturalized_bi, nationality_birth, entry_yr, years_kr, income, weight, wave) %>%
mutate(relation = as.numeric(relation),
naturalized_bi = as.numeric(naturalized_bi),
nationality_birth = as.numeric(nationality_birth),
income = as.numeric(income))
df_slim_2018 <- df_2018 %>%
select(hhid, relation, naturalize_yr, naturalized_bi, nationality_birth, entry_yr, years_kr, income, weight, wave) %>%
mutate(relation = as.numeric(relation),
naturalized_bi = as.numeric(naturalized_bi),
nationality_birth = as.numeric(nationality_birth),
income = as.numeric(income))
df_slim_2021 <- df_2021 %>%
select(hhid, relation, naturalize_yr, naturalized_bi, nationality_birth, entry_yr, years_kr, income, weight, wave)
label_info <- tibble(
varname = names(df_slim_2021),
# Get labels using val_labels(), handling NULLs, and convert to tibbles immediately
label_data = lapply(df_slim_2021, function(col) {
# *** Use val_labels() instead of labels() ***
lbls <- labelled::val_labels(col)
if (!is.null(lbls) && length(lbls) > 0) {
# Convert the named vector into a tibble with 'label' and 'code' columns
tibble::enframe(lbls, name = "label", value = "code")
} else {
# Return NULL if no labels, so it can be filtered out
NULL
}
})
) %>%
# Filter out variables that didn't have labels (where label_data is NULL)
filter(!sapply(label_data, is.null)) %>%
# Now unnest the tibbles contained in the label_data list-column
unnest(label_data) %>%
# Select the desired columns (already correctly named)
select(varname, code, label)
df_slim_2021 <- df_slim_2021 %>%
mutate(relation = as.numeric(relation),
naturalized_bi = as.numeric(naturalized_bi),
nationality_birth = as.numeric(nationality_birth),
income = as.numeric(income))
df_slim <- rbind(df_slim_2015, df_slim_2018) %>%
rbind(df_slim_2021)
```
<!-- </details> -->
### Number of Naturalizations by Year
<!-- <details> -->
<!-- <summary>Figure</summary> -->
```{r plot_naturalize}
#| echo: FALSE
plot_naturalize <- df_slim %>%
filter(relation == 1, !is.na(naturalize_yr)) %>%
mutate(naturalize_yr = as.numeric(as.character(naturalize_yr))) %>%
filter(!is.na(naturalize_yr)) %>%
group_by(naturalize_yr) %>%
summarize(weighted_n = sum(weight, na.rm = TRUE)) %>%
ggplot(aes(x = naturalize_yr, y = weighted_n)) +
geom_line() +
geom_point() +
labs(
title = "Weighted Number of Naturalizations by Year",
x = "Year of Naturalization",
y = "Weighted Frequency"
) +
geom_vline(xintercept = c(2010, 2011), linetype = "dashed", color = "red") +
scale_x_continuous(breaks = c(1940, 1960, 1980, 2000, 2010, 2020)) +
xlim(1996,2021) +
theme_minimal()
ggplotly(plot_naturalize)
#ggsave("plot_naturalize.png", plot = plot_naturalize, width = 8, height = 6, dpi = 300)
```
<!-- </details> -->
### By Allowing Dual Citizenship
<!-- <details> -->
<!-- <summary>Figure</summary> -->
Created a data with respondents from Vietnam, Philippines, Thailand, Japan, Mongolia, and China. Dual indicates 1 if the respondent is from Vietnam, Philippines, or Thailand and 0 if the respondent is from Japan, Mongolia, or China.
```{r dual_citizenship_data}
#| echo: FALSE
#| message: FALSE
#The 2011 reform would only affect those marriage migrants whose home countries allow dual citizenship. Vietnam, Philippines and Thailand do, but not Japan, Mongolia, China.
df_dual <- df_slim %>%
filter(nationality_birth %in% c(112, 113, 120, 122, 130, 145, 155, 170, 185)) %>% # 중국 112, 대만 113, 홍콩 120, 한국계 중국인 122, 일본 130, 몽골 145, 필리핀 155, 태국(타이) 170, 베트남 185
mutate(
dual = ifelse(nationality_birth %in% c(155, 170, 185), 1, 0)
)
```
```{r dual_citizenship_figure}
#| echo: FALSE
#| message: FALSE
plot_dual <- df_dual %>%
filter(relation == 1) %>%
filter(!is.na(naturalize_yr)) %>%
mutate(naturalize_yr = as.numeric(as.character(naturalize_yr))) %>%
filter(!is.na(naturalize_yr)) %>%
group_by(dual, naturalize_yr) %>%
summarise(weighted_n = sum(weight), na.rm = TRUE, .groups = 'drop') %>%
mutate(dual = factor(dual, levels = c(0, 1), labels = c("Not Allowed", "Allowed"))) %>%
ggplot(aes(x = naturalize_yr, y = weighted_n, color = dual, group = dual)) +
geom_line() +
geom_point() +
labs(
title = "Number of Naturalizations by Year, by Dual Citizenship Status",
x = "Year of Naturalization",
y = "Weighted Frequency",
color = "Dual Citizenship" # Legend title (adjust if needed)
) +
geom_vline(xintercept = c(2005, 2010, 2011), linetype = "dashed", color = "red") +
scale_x_continuous(breaks = c(1940, 1960, 1980, 2000, 2005, 2010, 2020)) +
xlim(1996,2021) +
theme_minimal() +
theme(legend.position = "bottom")
plotly_dual <- ggplotly(plot_dual)
# 3. Modify the plotly object's layout to move the legend
plotly_dual_final <- plotly_dual %>%
layout(legend = list(
orientation = "h", # "h" for horizontal orientation
xanchor = "center", # Anchor point of the legend ("auto", "left", "center", "right")
x = 0.5, # Horizontal position (0 to 1, 0.5 is center)
yanchor = "top", # Anchor point of the legend ("auto", "top", "middle", "bottom")
y = -0.2 # Vertical position (use negative values to place below plot, adjust distance)
))
# 4. Display the modified plotly object
plotly_dual_final
```
<!-- </details> -->
### By Country
<!-- <details> -->
<!-- <summary>Figure</summary> -->
China 112, Taiwan 113, Hong Kong 120, Korean Chinese 122, Japan 130, Mongolia 145, Phillippinnes 155, Thailand 170, Vietnam 185
```{r country_figure}
#| echo: FALSE
#| message: FALSE
plot_country <- df_slim %>%
filter(nationality_birth %in% c(112, 113, 120, 122, 130, 145, 155, 170, 185)) %>%
filter(relation == 1) %>%
filter(!is.na(naturalize_yr)) %>%
mutate(naturalize_yr = as.numeric(as.character(naturalize_yr))) %>%
filter(!is.na(naturalize_yr)) %>%
group_by(nationality_birth, naturalize_yr) %>%
summarise(weighted_n = sum(weight), na.rm = TRUE, .groups = 'drop') %>%
mutate(nationality_birth = factor(nationality_birth)) %>%
ggplot(aes(x = naturalize_yr, y = weighted_n, color = nationality_birth, group = nationality_birth)) +
geom_line() +
geom_point() +
labs(
title = "Number of Naturalizations by Year, by Country",
x = "Year of Naturalization",
y = "Weighted Frequency",
color = "Nationality at Birth"
) +
geom_vline(xintercept = c(2005, 2010, 2011), linetype = "dashed", color = "red") +
scale_x_continuous(breaks = c(1940, 1960, 1980, 2000, 2005, 2010, 2020)) +
xlim(1996,2021) +
theme_minimal() +
theme(legend.position = "bottom")
plotly_country <- ggplotly(plot_country)
# 3. Modify the plotly object's layout to move the legend
plotly_country_final <- plotly_country %>%
layout(legend = list(
orientation = "h", # "h" for horizontal orientation
xanchor = "center", # Anchor point of the legend ("auto", "left", "center", "right")
x = 0.5, # Horizontal position (0 to 1, 0.5 is center)
yanchor = "top", # Anchor point of the legend ("auto", "top", "middle", "bottom")
y = -0.2 # Vertical position (use negative values to place below plot, adjust distance)
))
# 4. Display the modified plotly object
plotly_country_final
```
<!-- </details> -->
### By Income
<!-- <details> -->
<!-- <summary>Figure</summary> -->
Do we observe that the changes particularly helped marriage migrants with lower language ability or from poorer households?
```{r income_plot}
#| echo: FALSE
#| message: FALSE
plot_income <- df_slim %>%
filter(relation == 1) %>%
filter(!is.na(naturalize_yr)) %>%
mutate(naturalize_yr = as.numeric(as.character(naturalize_yr)),
income_bi = ifelse(income < 3, 0, 1),
income = factor(income)) %>%
filter(!is.na(naturalize_yr)) %>%
group_by(income, naturalize_yr) %>%
summarise(weighted_n = sum(weight), na.rm = TRUE, .groups = 'drop') %>%
ggplot(aes(x = naturalize_yr, y = weighted_n, color = income, group = income)) +
geom_line() +
geom_point() +
labs(
title = "Number of Naturalizations by Year, by Income Groups",
x = "Year of Naturalization",
y = "Weighted Frequency",
color = "Income Groups" # Legend title (adjust if needed)
) +
geom_vline(xintercept = c(2005, 2010, 2011, 2018), linetype = "dashed", color = "red") +
scale_x_continuous(breaks = c(1940, 1960, 1980, 2000, 2005, 2010, 2018, 2020)) +
xlim(1996,2021) +
theme_minimal() +
theme(legend.position = "bottom")
plotly_income <- ggplotly(plot_income)
# 3. Modify the plotly object's layout to move the legend
plotly_income_final <- plotly_income %>%
layout(legend = list(
orientation = "h", # "h" for horizontal orientation
xanchor = "center", # Anchor point of the legend ("auto", "left", "center", "right")
x = 0.5, # Horizontal position (0 to 1, 0.5 is center)
yanchor = "top", # Anchor point of the legend ("auto", "top", "middle", "bottom")
y = -0.2 # Vertical position (use negative values to place below plot, adjust distance)
))
# 4. Display the modified plotly object
plotly_income_final
```
<!-- </details> -->
<details>
<summary>Income Table</summary>
```{r income_table}
#| echo: FALSE
#| message: FALSE
label_info %>%
filter(varname=="income") %>%
mutate(`Monthly Income`=case_when(
code == 1 ~ "$1,000",
code == 2 ~ "$2,000",
code == 3 ~ "$3,000",
code == 4 ~ "$4,000",
code == 5 ~ "$5,000",
code == 6 ~ "$6,000",
code == 7 ~ "$7,000",
code == 8 ~ "$8,000",
code == 9 ~ "over $8,000",
TRUE ~ NA_character_
))
```
</details>
## Using Government Statistics {.panel-tabset .panel-tabset-closed}
### Number of Naturalized Aliens by Year
<!-- <details> -->
<!-- <summary>Figure</summary> -->
Data from [ index.go.kr, 지표누리 e-나라지표 국적통계 추이 ](https://www.index.go.kr/unity/potal/main/EachDtlPageDetail.do?idx_cd=1760) .
```{r stat_naturalized}
#| echo: FALSE
#| message: FALSE
library(readxl)
yearly_naturalize <- read_excel("~/Dropbox/MEA-THL/Multicultural/data/yearly_naturalize.xlsx") %>%
mutate(year = as.numeric(year),
naturalize = as.numeric(naturalize),
recover = as.numeric(recover))
plot_stat_naturalize <- yearly_naturalize %>%
ggplot(aes(x = year, y = naturalize)) +
geom_line() +
geom_point() +
labs(title = "Number of Naturalizations by Year",
x = "Year of Naturalization",
y = "Frequency") +
geom_vline(xintercept = c(2005, 2010, 2011, 2014, 2020), linetype = "dashed", color = "red") +
scale_x_continuous(breaks = c(1940, 1960, 1980, 2000, 2010, 2020)) +
theme_minimal()
ggplotly(plot_stat_naturalize)
```
<!-- </details> -->
### Number of Naturalized Aliense by Year by Allowing Dual Citizenship
<!-- <details> -->
<!-- <summary>Figure</summary> -->
Data from [ data.go.kr, 공공데이터포털 법무부_55(국적) 연도별 국적취득 국적(지역)별 현황 ](https://www.data.go.kr/data/15100047/fileData.do) .
I summed naturalization (귀화) and recovering nationality (국적회복) to get the frequency.
```{r stat_dual}
#| echo: FALSE
#| message: FALSE
#| warning: FALSE
country_natrualize <- read_csv("~/Dropbox/MEA-THL/Multicultural/data/yearly_country_naturalize.csv", locale = locale(encoding = "CP949")) %>%
rename(
year = 년,
type = 유형,
country = 국적,
frequency = 건수
) %>%
mutate(
dual = case_when(
country %in% c("베트남", "필리핀", "태국") ~ 1,
country %in% c("중국", "타이완", "한국계 중국인", "홍콩", "일본", "몽골") ~ 0,
TRUE ~ NA_real_
),
)
plot_dual <- country_natrualize %>%
filter(!is.na(dual)) %>%
group_by(dual, year) %>%
summarise(n = sum(frequency, na.rm = TRUE), .groups = 'drop') %>%
mutate(dual = factor(dual, levels = c(0, 1), labels = c("Not Allowed", "Allowed"))) %>%
ggplot(aes(x = year, y = n, color = dual, group = dual)) +
geom_line() +
geom_point() +
labs(
title = "Number of Naturalizations by Year, by Dual Citizenship Status",
x = "Year of Naturalization",
y = "Frequency",
color = "Dual Citizenship"
) +
scale_x_continuous(breaks = c(2011, 2014, 2017, 2020)) +
ylim(0, 5800) +
theme_minimal() +
theme(legend.position = "bottom")
plotly_dual <- ggplotly(plot_dual)
# 3. Modify the plotly object's layout to move the legend
plotly_dual_final <- plotly_dual %>%
layout(legend = list(
orientation = "h", # "h" for horizontal orientation
xanchor = "center", # Anchor point of the legend ("auto", "left", "center", "right")
x = 0.5, # Horizontal position (0 to 1, 0.5 is center)
yanchor = "top", # Anchor point of the legend ("auto", "top", "middle", "bottom")
y = -0.2 # Vertical position (use negative values to place below plot, adjust distance)
))
# 4. Display the modified plotly object
plotly_dual_final
```
<!-- </details> -->