Ch 03. 인구 통계 데이터 분석

지하철 데이터로 분석 사이클을 한 번 돌렸습니다. 이번에는 인구 통계 데이터를 다룹니다. 더 넓고 긴 데이터입니다.

행정안전부는 매월 주민등록 인구 현황을 공개합니다. 시도·시군구 단위의 성별·연령별 인구를 담고 있어 우리나라 인구 구조를 분석하기에 좋은 데이터입니다.

이 챕터에서 다룰 질문은 세 가지입니다. 지역별 인구 격차가 얼마나 될까? 연령 분포는 어떻게 생겼을까? 시간이 지나면서 인구는 어떻게 변해왔을까?

데이터 가져오기

공공데이터 포털(data.go.kr)에서 '주민등록인구 현황'을 검색합니다. 행정안전부에서 제공하는 '주민등록 인구 및 세대현황'을 내려받습니다. 연도별 파일이 별도로 제공되므로 원하는 연도를 선택합니다.

파일을 data/population.csv 경로에 저장한 뒤 불러옵니다.

library(tidyverse)

# Real-world files are frequently EUC-KR encoded, so the encoding is
# declared explicitly in the locale passed to the reader.
df_raw <- "data/population.csv" |>
  read_csv(locale = locale(encoding = "EUC-KR"))

# Inspect the column names and types of the imported table
glimpse(df_raw)

실습을 위해 같은 구조의 예제 데이터를 만들겠습니다. 인구수는 난수로 생성하므로 실제 통계와는 다릅니다.

library(tidyverse)

# Fix the RNG seed so the simulated populations are reproducible
set.seed(123)

# Region names used as the region key of the example data
regions <- c(
  "서울특별시",
  "부산광역시", "대구광역시", "인천광역시",
  "광주광역시", "대전광역시", "울산광역시",
  "세종특별자치시",
  "경기도", "강원도",
  "충청북도", "충청남도",
  "전라북도", "전라남도",
  "경상북도", "경상남도",
  "제주특별자치도"
)

# Five-year age bands: "0~4세", "5~9세", ..., "95~99세"
age_groups <- paste0(
  seq(0, 95, by = 5), "~", seq(0, 95, by = 5) + 4, "세"
)

# One row per year x region x age band x gender, with a random population
# scaled by a per-region weight.
df_raw <- expand_grid(
  year = 2015:2023,
  region = regions,
  age_group = age_groups,
  gender = c("남", "여")
) |>
  mutate(
    # Seoul and Gyeonggi get the largest weights, Busan and Incheon a
    # medium one, and every other region the smallest.
    region_weight = case_when(
      region == "경기도" ~ 12,
      region == "서울특별시" ~ 10,
      region %in% c("부산광역시", "인천광역시") ~ 5,
      TRUE ~ 2
    ),
    population = as.integer(runif(n(), 500, 50000) * region_weight),
    # Drop the helper column so only the five data columns remain
    region_weight = NULL
  )

glimpse(df_raw)

전처리

컬럼 정리와 타입 변환

# Tidy the column names and coerce the types in a single pipeline.
# Previously the type conversion restarted from df_raw, which discarded the
# result of rename(); with a real file (rename mappings uncommented) the
# mutate() step would then fail because `year` / `population` did not exist.
df <- df_raw |>
  rename(
    # For a real file, uncomment these lines and adjust the right-hand
    # names to match its columns. With nothing uncommented, rename()
    # leaves the example data unchanged.
    # year       = `기준연도`,
    # region     = `행정구역`,
    # age_group  = `연령구간`,
    # gender     = `성별`,
    # population = `인구수`
  ) |>
  mutate(
    year       = as.integer(year),
    population = as.integer(population)
  )

# Count missing values per column
colSums(is.na(df))

시도 단위로 집계

실제 데이터는 시군구까지 세분화되어 있으므로 시도 단위로 집계해야 합니다. 예제 데이터는 이미 시도 단위이므로, 여기서는 연령대와 성별을 합산하여 연도별·시도별 총인구를 구합니다.

# Sum population over all remaining columns to get one total per
# year and region; the result is returned ungrouped.
df_region <- df |>
  group_by(year, region) |>
  summarise(total_pop = sum(population), .groups = "drop_last") |>
  ungroup()

# Preview the first rows of the aggregated table
head(df_region, n = 6)

지역별 인구 분석

시도별 인구 현황

# Most recent year present in the aggregated table
latest_year <- df_region |>
  pull(year) |>
  max()

# Regions of that year, ordered from most to least populous
region_latest <- arrange(
  filter(df_region, year == latest_year),
  desc(total_pop)
)

print(region_latest)

수도권 vs 비수도권

# Tag each region as capital area ("수도권": Seoul, Gyeonggi, Incheon)
# or non-capital area ("비수도권").
df_region <- df_region |>
  mutate(
    area_type = if_else(
      region %in% c("서울특별시", "경기도", "인천광역시"),
      "수도권",
      "비수도권"
    )
  )

# Yearly totals per area type, spread into one column per area type,
# plus the capital-area share of the total in percent (one decimal).
metro_compare <- df_region |>
  group_by(year, area_type) |>
  summarise(total = sum(total_pop), .groups = "drop") |>
  pivot_wider(names_from = area_type, values_from = total) |>
  mutate(total_all = 수도권 + 비수도권) |>
  mutate(metro_ratio = round(수도권 / total_all * 100, 1))

print(metro_compare)

연령별 인구 분석

연령 구간 순서 지정

# Age-band labels in ascending age order: "0~4세", ..., "95~99세"
age_levels <- paste0(
  seq(from = 0, to = 95, by = 5),
  "~",
  seq(from = 4, to = 99, by = 5),
  "세"
)

# Store age_group as a factor whose levels follow that order, so that
# tables and plots list the bands by age rather than alphabetically.
df$age_group <- factor(df$age_group, levels = age_levels)

전국 연령별 인구 분포

# Total population per age band in the latest year
age_dist <- df |>
  filter(year == latest_year) |>
  group_by(age_group) |>
  summarise(total = sum(population), .groups = "drop_last") |>
  ungroup()

print(age_dist)

시각화

그래프 1: 시도별 인구 막대 그래프

library(ggplot2)

# Horizontal bar chart of the latest-year population per region.
# Regions are reordered by population so the bars appear sorted, and the
# value axis is expressed in units of 10,000 people.
region_latest |>
  mutate(region = fct_reorder(region, total_pop)) |>
  ggplot(aes(region, total_pop / 10000, fill = total_pop)) +
  geom_col() +
  scale_fill_gradient(low = "#deebf7", high = "#2171b5") +
  coord_flip() +
  labs(
    title = paste0(latest_year, "년 시도별 인구"),
    fill  = "인구",
    x     = "시도",
    y     = "인구 (만 명)"
  ) +
  theme_minimal(base_family = "AppleGothic") +
  theme(
    # The fill colour only repeats the bar length, so the legend is hidden
    legend.position = "none",
    plot.title      = element_text(face = "bold", size = 14)
  )

그래프 2: 인구 피라미드

인구 피라미드는 연령별·성별 인구 분포를 좌우 대칭 막대 그래프로 표현합니다. 저출산·고령화 현황을 한눈에 볼 수 있습니다.

# Latest-year population per age band and gender. Male counts are
# flipped to negative values so the two genders extend in opposite
# directions from zero.
pyramid_data <- df |>
  filter(year == latest_year) |>
  group_by(age_group, gender) |>
  summarise(population = sum(population), .groups = "drop") |>
  mutate(pop_plot = population * if_else(gender == "남", -1L, 1L))

# Population pyramid; values are shown in units of 10,000 people
pyramid_data |>
  ggplot(aes(age_group, pop_plot / 10000, fill = gender)) +
  geom_col(width = 0.8) +
  coord_flip() +
  # Axis labels show absolute values so the male side does not read
  # as negative
  scale_y_continuous(labels = \(v) paste0(abs(v), "만")) +
  scale_fill_manual(values = c("남" = "#4393c3", "여" = "#d6604d")) +
  labs(
    title = paste0(latest_year, "년 전국 인구 피라미드"),
    fill  = "성별",
    x     = "연령대",
    y     = "인구 (만 명)"
  ) +
  theme_minimal(base_family = "AppleGothic") +
  theme(
    legend.position = "bottom",
    plot.title      = element_text(face = "bold", size = 14)
  )

그래프 3: 연도별 인구 트렌드

# National total population per year (sum over all regions)
nation_trend <- df_region |>
  group_by(year) |>
  summarise(total = sum(total_pop), .groups = "drop")

# Line chart of the national total, in units of 10,000 people, with a
# value label above each point.
ggplot(nation_trend, aes(x = year, y = total / 10000)) +
  geom_line(linewidth = 1.2, color = "#2171b5") +
  geom_point(size = 3, color = "#2171b5") +
  geom_label(
    aes(label = paste0(round(total / 10000, 0), "만")),
    vjust = -0.5,
    size  = 3,
    family = "AppleGothic"
  ) +
  # Take one tick per year from the data rather than hard-coding
  # 2015:2023, so the axis stays correct for any year range.
  scale_x_continuous(breaks = nation_trend$year) +
  labs(
    title = "연도별 전국 총인구 변화",
    x     = "연도",
    y     = "인구 (만 명)"
  ) +
  theme_minimal(base_family = "AppleGothic") +
  theme(plot.title = element_text(face = "bold", size = 14))

그래프 4: 수도권 집중도 트렌드

# Line chart of the capital-area share of the population (percent) by
# year, with the percentage printed above each point.
metro_compare |>
  ggplot(aes(x = year, y = metro_ratio)) +
  geom_line(linewidth = 1.2, color = "#d73027") +
  geom_point(size = 3, color = "#d73027") +
  geom_text(
    aes(label = paste0(metro_ratio, "%")),
    vjust = -0.8,
    size  = 3,
    family = "AppleGothic"
  ) +
  # Take one tick per year from the data rather than hard-coding
  # 2015:2023, so the axis stays correct for any year range.
  scale_x_continuous(breaks = metro_compare$year) +
  # Fix the axis to the full 0-100 % range so the share is not
  # visually exaggerated
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = "연도별 수도권 인구 집중도",
    x     = "연도",
    y     = "수도권 인구 비율 (%)"
  ) +
  theme_minimal(base_family = "AppleGothic") +
  theme(plot.title = element_text(face = "bold", size = 14))

분석 요약

분석 결과를 숫자로 정리합니다. 보고서에 넣기 좋은 형태입니다.

# Print the headline figures of the analysis as plain text
cat("=== 인구 분석 주요 수치 ===\n\n")

# Reference year of the analysis
cat("분석 기준 연도:", latest_year, "\n")

# National total for the latest year, with thousands separators
cat(
  "전국 총인구:",
  region_latest |> pull(total_pop) |> sum() |> format(big.mark = ","),
  "명\n"
)

# region_latest is already sorted by population, so the first three
# entries are the three most populous regions
top3 <- head(region_latest$region, 3)
cat("인구 상위 3개 시도:", toString(top3), "\n")

# Capital-area share of the population in the latest year
metro_latest <- filter(metro_compare, year == latest_year)
cat("수도권 집중도:", pull(metro_latest, metro_ratio), "%\n")

인구 통계 분석을 통해 데이터의 규모가 커져도 같은 패턴이 반복된다는 것을 확인했습니다. 전처리로 데이터를 정제하고, 집계로 인사이트를 찾고, 시각화로 전달합니다. 다음 챕터에서는 이 결과들을 하나의 리포트로 정리하는 방법을 배웁니다.