College Entrance Exam Major Selection: A Case Study of Shandong Province

Published: July 04, 2024

高考志愿填报趋势大揭秘——以山东省为例

College Entrance Exam Major Selection: A Case Study of Shandong Province

Abstract

The article analyzes changes in college application preferences among high school graduates in Shandong Province from 2020 to 2023. Shandong Province was chosen because of its large number of test-takers and the standardized data format available from the local education examination authority.

The analysis standardizes the admission scores of the different majors within each university to a 1-100 scale. It then groups the standardized scores into high, medium-high, medium-low, and low segments to compare the popularity of majors across years.

Conclusions

Popular majors for lower-ranking students remained relatively stable, while those for higher-ranking students changed significantly.
From 2020 to 2023, the popularity of lesser-known language majors decreased, while computer science, software engineering, and electronic information majors increased in popularity.
Major popularity at top universities remained relatively stable, while mid-tier universities experienced significant shifts.
Many majors at Xiamen University saw a significant rise in popularity, while majors at Central South University and Beijing Foreign Studies University showed a notable decline.

Read more: WeChat (in Chinese)

# Show each chunk's source code in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Use the project checkout as the working directory for chunk evaluation.
# NOTE(review): this is a machine-specific absolute path.
knitr::opts_knit$set(root.dir = "/Users/sousekilyu/Documents/GitHub/GaoKaoVer2")

Data preparation

# Run the project's ETL script. Later chunks use objects such as `dt_rank_cmb`
# that are presumably created here -- TODO confirm against etl.R.
source("~/Documents/GitHub/GaoKaoVer2/main/etl.R")

## Joining with `by = join_by(school)`

# Load the project's helper script (presumably where `ggsaveTheme()` and the
# `my_theme*` objects used below are defined -- TODO confirm).
# NOTE(review): hard-coded absolute path; only resolves on the author's machine.
source("/Users/sousekilyu/Documents/GitHub/GaoKaoVer2/main/function.r")

plot of chunk unnamed-chunk-6

# Save plot `p` as Figure 3-1 through the project's ggsaveTheme() helper,
# using the `my_theme_legend` theme object.
ggsaveTheme(
    p,
    filename = "plot/Figure 3-1.score_by_major_rough_change.png",
    mytheme = my_theme_legend,
    width = 16, height = 12, dpi = 300
)

热门高校变化

重点高校专业热度变化 / 高分段学校的低分段专业，同

# NOTE (author): a school's lowest admission score is heavily influenced by
# individual majors, so the school-level score is ranked by the median rather
# than by the lowest admission rank.
#
# 2023 school table: `school` is 院校 with its first four characters dropped
# (e.g. the "D904" prefix), and `rank` is the dense rank of
# score_by_school_scale in descending order. An optional `rank <= 30` cut-off
# is intentionally left disabled here.
dt_school_top <- dt_rank_cmb %>%
    filter(year == 2023) %>%
    mutate(
        school = substr(院校, 5, nchar(院校)),
        rank = dense_rank(desc(score_by_school_scale))
    ) %>%
    ungroup()

# Keep only the majors whose school appears in the 2023 table, then attach the
# school-level score, short school name and rank (de-duplicated lookup rows).
dt_school_top_change <- score_by_major_rough_change %>%
    filter(院校 %in% dt_school_top$院校) %>%
    left_join(
        dt_school_top %>%
            select(院校, score_by_school_scale, school, rank) %>%
            unique(),
        by = "院校"
    )

head(dt_school_top_change)

## # A tibble: 6 × 12
##   院校      major province city  countn score_by_major_early score_by_major_later score_by_major_change major_rough score_by_school_scale school  rank
##   <chr>     <chr> <chr>    <chr>  <int>                <dbl>                <dbl>                 <dbl> <chr>                       <dbl> <chr>  <int>
## 1 D904北京… 计算… 北京市   北京…      2                 12.5                 75.4                  63.0 计算机类                     31.8 北京…    685
## 2 D601长春… 软件… 吉林省   长春…      2                 13.2                 72.6                  59.4 软件工程                     12.9 长春…    970
## 3 D905南京… 计算… 江苏省   南京…      2                 25.1                 78.8                  53.7 计算机类                     19.2 南京…    870
## 4 D991江苏… 能源… 江苏省   苏州…      2                 22.4                 71.0                  48.6 能源与动力…                  32.1 江苏…    682
## 5 D897天津… 数据… 天津市   天津…      2                 19.5                 67.3                  47.8 数据科学与…                  19.4 天津…    868
## 6 D897天津… 软件… 天津市   天津…      2                 20.8                 66.7                  45.9 软件工程                     19.4 天津…    868

# Dot plot for rows whose school rank is at most 50: x is the change in major
# score, y lists schools ordered by score_by_school_scale. Points are coloured
# "#00BA38" when the change is positive and "#F8756D" otherwise; the colour
# strings are used literally via scale_color_identity().
p_school_change <- dt_school_top_change %>%
    filter(rank <= 50) %>%
    ggplot(
        aes(
            x = score_by_major_change,
            y = reorder(school, score_by_school_scale),
            color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
        )
    ) +
    geom_point(alpha = .5, size = 5) +
    scale_color_identity() +
    theme_bw() +
    theme(text = element_text(size = 10, family = "Canger")) +
    labs(
        title = "Popularity Changes in Top 50 Unis' Majors, 2020-2023",
        x = "Δ Popularity",
        y = "Universities"
    )

# Render the figure in the document (saving to disk happens separately).
print(p_school_change)

plot of chunk unnamed-chunk-8

# Save the top-50 overview plot as Figure 4-1 through the project's
# ggsaveTheme() helper, using the `my_theme` theme object.
ggsaveTheme(
    p_school_change,
    filename = "plot/Figure 4-1.top_uni_change_by_major.png",
    mytheme = my_theme,
    width = 12, height = 16, dpi = 300
)

# # zoom out
# p_school_change_zoom <- dt_school_top_change %>%
#     filter(rank <= 50) %>%
#     ggplot(aes(
#         x = score_by_major_change,
#         y = reorder(school, score_by_school_scale),
#         color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
#     )) +
#     geom_point(size = 5, alpha = .5) +
#     scale_color_identity() +
#     coord_cartesian(xlim = c(-5, 5)) +
#     # scale_x_log10() +
#     theme_bw() +
#     theme(text = element_text(family = "Canger", size = 10)) +
#     labs(title = "Popularity Changes in Top 50 Universities' Majors, 2020-2023", x = "Δ Popularity (Zoom out)", y = "Universities")
# # save png
# print(p_school_change_zoom)
# ggsaveTheme(p_school_change_zoom,
#     mytheme = my_theme,
#     filename = "plot/Figure 7.top_uni_change_by_major_zoom.png",
#     width = 12,
#     height = 16,
#     dpi = 300
# )

Top 50 cases

# Per-major rows for schools with rank <= 50, annotated with:
#   delta             - 1 when the major's score change is positive, else 0
#   avg_majors_scores - mean score change over the rows sharing (delta, school)
#   rank_avg          - dense rank of that mean within each delta group,
#                       descending (rank 1 = largest mean)
.school_major <- dt_school_top_change %>%
    filter(rank <= 50) %>%
    dplyr::select(
        school, rank, major, major_rough,
        score_by_major_change, score_by_school_scale
    ) %>%
    mutate(delta = ifelse(score_by_major_change > 0, 1, 0)) %>%
    mutate(
        avg_majors_scores = mean(score_by_major_change, na.rm = TRUE),
        .by = c(delta, school)
    ) %>%
    # Rank schools by their average major change, separately per direction.
    mutate(
        rank_avg = dense_rank(desc(avg_majors_scores)),
        .by = delta
    )

# Rising majors (delta == 1) at the schools whose average rise ranks 1-10.
# Each point is labelled with its major name; label connector segments are
# disabled (min.segment.length = Inf) and the x axis is zoomed to [0, 8].
p_up <- .school_major %>%
    filter(
        delta == 1,
        between(rank_avg, 1, 10)
    ) %>%
    dplyr::arrange(desc(avg_majors_scores), desc(score_by_major_change)) %>%
    ggplot(
        aes(
            x = score_by_major_change,
            y = reorder(school, avg_majors_scores),
            color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
        )
    ) +
    geom_point(alpha = .5, size = 5) +
    scale_color_identity() +
    coord_cartesian(xlim = c(0, 8)) +
    geom_text_repel(
        aes(label = major),
        color = "black",
        family = "Canger",
        size = 15,
        alpha = .7,
        vjust = -0.5,
        max.overlaps = 20,
        min.segment.length = Inf
    ) +
    theme_bw() +
    theme(text = element_text(size = 10, family = "Canger")) +
    labs(
        title = "Popularity Changes in Top Unis' Majors, 2020-2023",
        x = "Δ Popularity (Zoom out)",
        y = "Universities"
    )

# The plot is only written to disk here, not printed in the document.
ggsaveTheme(
    p_up,
    filename = "plot/Figure 4-2.up_major_names.png",
    mytheme = my_theme,
    width = 12, height = 16, dpi = 300
)

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps

# Non-rising majors (delta == 0) at the 10 schools with the largest average
# decline. Each point is labelled with its major name; label connector
# segments are disabled (min.segment.length = Inf) and the x axis is zoomed
# to [-10, 0].
#
# rank_avg is a dense rank computed *within* each delta group (descending
# mean), so the largest declines carry the highest ranks of the delta == 0
# group. The upper bound must therefore come from the delta == 0 rows only:
# counting distinct ranks over the whole table mixes in the delta == 1 group
# (and any NA rank) and can shift the window past the end, dropping schools.
p_down <- .school_major %>%
    arrange(avg_majors_scores, score_by_major_change) %>%
    filter(delta == 0) %>%
    # Keep the last 10 ranks of this group, i.e. [max - 9, max].
    filter(rank_avg > max(rank_avg, na.rm = TRUE) - 10) %>%
    ggplot(aes(
        x = score_by_major_change,
        y = reorder(school, avg_majors_scores),
        color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
    )) +
    geom_point(size = 5, alpha = .5) +
    scale_color_identity() +
    coord_cartesian(xlim = c(-10, 0)) +
    geom_text_repel(
        aes(label = major),
        min.segment.length = Inf,
        max.overlaps = 20,
        size = 15,
        family = "Canger",
        color = "black",
        vjust = -0.5,
        alpha = .7
    ) +
    theme_bw() +
    theme(text = element_text(family = "Canger", size = 10)) +
    labs(
        title = "Popularity Changes in Top Unis' Majors, 2020-2023",
        x = "Δ Popularity (Zoom out)",
        y = "Universities"
    )

# The plot is only written to disk here, not printed in the document.
ggsaveTheme(p_down,
    mytheme = my_theme,
    filename = "plot/Figure 4-3.down_major_names.png",
    width = 12,
    height = 16,
    dpi = 300
)

## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps

# More detailed variant of the "rising majors" figure: every delta == 1 row in
# .school_major is plotted (no restriction on rank_avg), with smaller labels.
# The x axis is zoomed to [0, 8] and label connector segments are disabled.
p_up2 <- .school_major %>%
    filter(delta == 1) %>%
    dplyr::arrange(desc(avg_majors_scores), desc(score_by_major_change)) %>%
    ggplot(
        aes(
            x = score_by_major_change,
            y = reorder(school, avg_majors_scores),
            color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
        )
    ) +
    geom_point(alpha = .5, size = 5) +
    scale_color_identity() +
    coord_cartesian(xlim = c(0, 8)) +
    geom_text_repel(
        aes(label = major),
        color = "black",
        family = "Canger",
        size = 7,
        alpha = .7,
        vjust = -0.5,
        max.overlaps = 15,
        min.segment.length = Inf
    ) +
    theme_bw() +
    theme(text = element_text(size = 10, family = "Canger")) +
    labs(
        title = "Popularity Changes in Top Unis' Majors, 2020-2023",
        x = "Δ Popularity (Zoom out)",
        y = "Universities"
    )

# Render the figure in the document.
print(p_up2)

## Warning: ggrepel: 153 unlabeled data points (too many overlaps). Consider increasing max.overlaps

plot of chunk unnamed-chunk-9

 # Save the detailed "rising majors" plot as Figure 4-4 through the project's
 # ggsaveTheme() helper, using the `my_theme` theme object.
 ggsaveTheme(
     p_up2,
     filename = "plot/Figure 4-4.png",
     mytheme = my_theme,
     width = 12, height = 20, dpi = 300
 )

## Warning: ggrepel: 63 unlabeled data points (too many overlaps). Consider increasing max.overlaps

 # More detailed variant of the "declining majors" figure: every delta == 0 row
 # in .school_major is plotted (no restriction on rank_avg), with smaller
 # labels. The x axis is zoomed to [-5, 0]; label connector segments are
 # disabled.
 p_down2 <- .school_major %>%
     filter(delta == 0) %>%
     arrange(avg_majors_scores, score_by_major_change) %>%
     ggplot(
         aes(
             x = score_by_major_change,
             y = reorder(school, avg_majors_scores),
             color = ifelse(score_by_major_change > 0, "#00BA38", "#F8756D")
         )
     ) +
     geom_point(alpha = .5, size = 5) +
     scale_color_identity() +
     coord_cartesian(xlim = c(-5, 0)) +
     geom_text_repel(
         aes(label = major),
         color = "black",
         family = "Canger",
         size = 7,
         alpha = .7,
         vjust = -0.5,
         max.overlaps = 15,
         min.segment.length = Inf
     ) +
     theme_bw() +
     theme(text = element_text(size = 10, family = "Canger")) +
     labs(
         title = "Popularity Changes in Top Unis' Majors, 2020-2023",
         x = "Δ Popularity (Zoom out)",
         y = "Universities"
     )

 # Render the figure in the document.
 print(p_down2)

## Warning: ggrepel: 224 unlabeled data points (too many overlaps). Consider increasing max.overlaps

plot of chunk unnamed-chunk-9

 # Save the detailed "declining majors" plot as Figure 4-5 through the
 # project's ggsaveTheme() helper, using the `my_theme` theme object.
 ggsaveTheme(
     p_down2,
     filename = "plot/Figure 4-5.png",
     mytheme = my_theme,
     width = 12, height = 20, dpi = 300
 )

## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider increasing max.overlaps

# #'
# #' ### 名校「冷门」专业对高分考生的吸引力

# # 名校冷门专业占比 变化不大
# dt01 <- score_by_major_group_time %>%
#     filter(school %in% project211) %>%
#     dplyr::select(school, major, major_rough, year, city, province, score_group) %>%
#     group_by(year) %>%
#     summarise(
#         fraction01 = round(sum(ifelse(major_rough %in% c("外国语言文学", "环境科学类", "公共管理类", "哲学", "新闻传播学", "建筑学类", "土木工程类", "经济学类", "金融类", "地质学"), 1, 0)) / n(), 3)
#     )
# # 对高分考生的吸引力已经减弱
# dt02 <- score_by_major_group_time %>%
#     filter(
#         major_rough %in% c("外国语言文学", "环境科学类", "公共管理类", "哲学", "新闻传播学", "建筑学类", "土木工程类", "经济学类", "金融类", "地质学"),
#         school %in% project211
#     ) %>%
#     dplyr::select(school, major, major_rough, year, city, province, score_group) %>%
#     group_by(year) %>%
#     summarise(
#         fraction02 = round(sum(ifelse(score_group %in% c("高分段"), 1, 0)) / n(), 3)
#     )
# dt_merge <- left_join(dt01, dt02) %>%
#     mutate(fraction02_plot = fraction01 * fraction02)
# real_value <- c(dt_merge$fraction01, dt_merge$fraction02)
# # plot dt_merge as barplot
# dp1 <- dt_merge %>%
#     gather(key = "variable", value = "value", -year) %>%
#     filter(variable %in% c("fraction01", "fraction02_plot")) %>%
#     data.frame(value2 = as.character(formattable::percent(real_value))) %>%
#     mutate(
#         value = formattable::percent(value),
#         value2 = ifelse(variable == "fraction02_plot", paste0(value2, " in majs."), value2)
#     ) %>%
#     ggplot(aes(x = year, y = value, fill = variable)) +
#     geom_bar(stat = "identity", position = position_dodge()) +
#     geom_text(aes(label = value2), position = position_dodge(width = 0.9), vjust = -0.25) +
#     scale_fill_manual("", values = c("#016FBC", "#E8BA00"), labels = c("fraction01" = "% of Unpop. Majs.", "fraction02_plot" = "% of High-Scoring Majs. in Unpop. Majs.")) +
#     theme_bw() +
#     theme(
#         text = element_text(family = "Canger", size = 10),
#         legend.position = "bottom"
#     ) +
#     labs(title = "% of Unpopular Majors in 985 & 211 Univs.", x = "Year", y = "Percentage %", fill = "Variable")

# print(dp1)
# dp1 <- dt_merge %>%
#     gather(key = "variable", value = "value", -year) %>%
#     filter(variable %in% c("fraction01", "fraction02_plot")) %>%
#     data.frame(value2 = as.character(formattable::percent(real_value))) %>%
#     mutate(
#         value = formattable::percent(value),
#         value2 = ifelse(variable == "fraction02_plot", paste0(value2, " in majs."), value2)
#     ) %>%
#     ggplot(aes(x = year, y = value, fill = variable)) +
#     geom_bar(stat = "identity", position = position_dodge()) +
#     #geom_text(aes(label = value2), position = position_dodge(width = 0.9), vjust = -0.25) +
#     scale_fill_manual("", values = c("#016FBC", "#E8BA00"), labels = c("fraction01" = "% of Unpop. Majs.", "fraction02_plot" = "% of High-Scoring Majs. in Unpop. Majs.")) +
#     theme_bw() +
#     theme(
#         text = element_text(family = "Canger", size = 10),
#         legend.position = "bottom"
#     ) +
#     labs(title = "% of Unpopular Majors in 985 & 211 Univs.", x = "Year", y = "Percentage %", fill = "Variable")
#  +
#     geom_text(aes(label = value2), position = position_dodge(width = 0.9), vjust = -0.25, size = 25)
# ggsaveTheme(dp1,
#     mytheme = my_theme_legend_define,
#     filename = "plot/Figure 5.percentage of unpopular majors in univs.png",
#     width = 16,
#     height = 12,
#     dpi = 300
# )

Share on

Bluesky Facebook LinkedIn X (formerly Twitter)

Felix Liu

College Entrance Exam Major Selection: A Case Study of Shandong Province

高考志愿填报趋势大揭秘——以山东省为例