代码
# Install (if missing) and attach every package used in this chapter.
required_packages <- c(
  "tidyverse",     # data wrangling and visualisation
  "gtsummary",     # statistical summary tables
  "broom",         # tidying of statistical results
  "corrplot",      # correlation-matrix plots
  "medicaldata",   # medical data sets
  "vcd"            # visualising categorical data
)

# Work out which packages cannot be loaded. requireNamespace() is used rather
# than scanning installed.packages(): the latter builds a matrix describing
# every installed package just to answer a yes/no question per name.
is_available <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
new_packages <- required_packages[!is_available]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach all packages; invisible() hides the list that lapply() returns.
invisible(lapply(required_packages, library, character.only = TRUE))

5.1 描述性统计

5.1.1 连续变量(年龄、体温)

代码
# Simulate an example cohort of 100 patients; the seed makes it repeatable.
set.seed(123)
patient_data <- local({
  n_patients <- 100
  # Draw the columns in this fixed order (age, temp, gender, group) so the
  # random-number stream is consumed the same way on every run.
  age <- rnorm(n_patients, mean = 45, sd = 15)
  temp <- rnorm(n_patients, mean = 37, sd = 0.5)
  gender <- factor(
    sample(c("男", "女"), size = n_patients, replace = TRUE)
  )
  group <- factor(
    sample(c("对照组", "治疗组"), size = n_patients, replace = TRUE)
  )
  data.frame(age = age, temp = temp, gender = gender, group = group)
})

# Basic descriptive statistics (min, quartiles, median, mean, max) for the
# two continuous columns.
patient_summary <- summary(select(patient_data, age, temp))

print(patient_summary)
      age             temp      
 Min.   :10.36   Min.   :35.97  
 1st Qu.:37.59   1st Qu.:36.60  
 Median :45.93   Median :36.89  
 Mean   :46.36   Mean   :36.95  
 3rd Qu.:55.38   3rd Qu.:37.23  
 Max.   :77.81   Max.   :38.62  
代码
# Build a more polished summary table with gtsummary, one column per group.
group_summary_tbl <- tbl_summary(
  select(patient_data, age, temp, gender, group),
  by = group,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  ),
  digits = all_continuous() ~ 1
)
# Append the p-value column, then the N column.
group_summary_tbl <- add_p(group_summary_tbl)
group_summary_tbl <- add_n(group_summary_tbl)
# Relabel the first column header; the resulting table is auto-printed.
modify_header(group_summary_tbl, label = "**变量**")
| 变量 | N | 对照组 N = 43¹ | 治疗组 N = 57¹ | p-value |
|---|---|---|---|---|
| age | 100 | 46.8 (15.4) | 46.0 (12.4) | |
| temp | 100 | 36.9 (0.5) | 37.0 (0.5) | |
| gender | 100 | | | |
|     男 | | 24 (56%) | 26 (46%) | |
|     女 | | 19 (44%) | 31 (54%) | |

¹ Mean (SD); n (%)

5.1.2 分类变量(性别、治疗方案、疗效)

代码
# Simulated categorical data: 100 rows per gender, with treatment arm and
# response drawn at random (response weighted 0.7 / 0.3). No seed is set in
# this chunk, so the draws continue from the current RNG state.
cat_data <- data.frame(
  gender = factor(rep(c("男", "女"), times = c(100, 100))),
  treatment = factor(
    sample(c("治疗A", "治疗B", "治疗C"), size = 200, replace = TRUE)
  ),
  response = factor(
    sample(c("有效", "无效"), size = 200, replace = TRUE, prob = c(0.7, 0.3))
  )
)

# Contingency table: treatment in rows, response in columns.
treatment_response <- table(cat_data[["treatment"]], cat_data[["response"]])
print("治疗方案与疗效的分布:")
[1] "治疗方案与疗效的分布:"
代码
# Display the treatment-by-response contingency table.
print(treatment_response)
       
        无效 有效
  治疗A   15   53
  治疗B   19   40
  治疗C   21   52
代码
# Summarise the categorical variables with gtsummary, split by treatment.
treatment_tbl <- tbl_summary(cat_data, by = treatment, missing = "no")
treatment_tbl <- add_p(treatment_tbl)
# Place one shared header above the three per-treatment statistic columns;
# the resulting table is auto-printed.
modify_spanning_header(
  treatment_tbl,
  c("stat_1", "stat_2", "stat_3") ~ "治疗方案"
)
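| Characteristic | 治疗方案:治疗A N = 68¹ | 治疗方案:治疗B N = 59¹ | 治疗方案:治疗C N = 73¹ | p-value |
|---|---|---|---|---|
| gender | | | | |
|     男 | 34 (50%) | 33 (56%) | 33 (45%) | |
|     女 | 34 (50%) | 26 (44%) | 40 (55%) | |
| response | | | | |
|     无效 | 15 (22%) | 19 (32%) | 21 (29%) | |
|     有效 | 53 (78%) | 40 (68%) | 52 (71%) | |

¹ n (%)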

5.2 统计推断方法

5.2.1 t检验(药物组间比较)

代码
# Simulate two treatment arms of 50 measurements each; seeded for
# repeatability.
set.seed(123)
treatment_data <- data.frame(
  group = factor(rep(c("对照组", "治疗组"), each = 50)),
  # A single vectorised draw: the first 50 values use mean 100 (control
  # group), the last 50 use mean 110 (treatment group), all with sd 15.
  value = rnorm(100, mean = rep(c(100, 110), each = 50), sd = 15)
)

# Two-sample t-test comparing value between the two groups.
t_test_result <- t.test(value ~ group, data = treatment_data)

# Box plot comparing the two groups; the subtitle carries the t-test p-value.
ggplot(data = treatment_data, mapping = aes(group, value, fill = group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "治疗效果比较",
    subtitle = paste("p值 =", format.pval(t_test_result$p.value, digits = 3)),
    x = "分组",
    y = "测量值"
  )

5.2.2 卡方检验(发病率差异)

代码
# Simulated disease status: rows 1-100 are "男" (sampling weights 0.3 / 0.7
# for "是" / "否"), rows 101-200 are "女" (weights 0.2 / 0.8). No seed is set
# in this chunk, so the draws continue from the current RNG state.
disease_data <- local({
  outcomes <- c("是", "否")
  # The first group is sampled before the second, fixing the order in which
  # random numbers are consumed.
  status_first <- sample(
    outcomes, size = 100, replace = TRUE, prob = c(0.3, 0.7)
  )
  status_second <- sample(
    outcomes, size = 100, replace = TRUE, prob = c(0.2, 0.8)
  )
  data.frame(
    gender = factor(rep(c("男", "女"), each = 100)),
    disease = factor(c(status_first, status_second))
  )
})

# Chi-squared test on the gender-by-disease contingency table.
chi_result <- chisq.test(table(disease_data$gender, disease_data$disease))

# Mosaic plot of gender against disease status, drawn with vcd.
library(vcd)
mosaic(
  ~ gender + disease,
  data = disease_data,
  shade = TRUE,
  main = "性别与疾病发生率关系"
)

代码
# Print the chi-squared test result (test statistic, degrees of freedom and
# p-value).
print(chi_result)

    Pearson's Chi-squared test with Yates' continuity correction

data:  table(disease_data$gender, disease_data$disease)
X-squared = 2.2204, df = 1, p-value = 0.1362
代码
# Contingency table: gender in rows, disease status in columns.
disease_table <- table(disease_data[["gender"]], disease_data[["disease"]])
print("性别与疾病的分布:")
[1] "性别与疾病的分布:"
代码
# Display the gender-by-disease contingency table.
print(disease_table)
    
     否 是
  男 71 29
  女 81 19

5.3 相关性分析

5.3.1 Pearson/Spearman相关

代码
# Simulate four clinical measurements for 100 subjects; seeded for
# repeatability.
set.seed(123)
cor_data <- local({
  n_subjects <- 100
  # Drawn in the order age, sbp, dbp, bmi so the random stream is consumed
  # the same way on every run.
  age <- rnorm(n_subjects, mean = 45, sd = 15)
  sbp <- rnorm(n_subjects, mean = 120, sd = 15)
  dbp <- rnorm(n_subjects, mean = 80, sd = 10)
  bmi <- rnorm(n_subjects, mean = 24, sd = 3)
  data.frame(age = age, sbp = sbp, dbp = dbp, bmi = bmi)
})

# Spearman rank correlation test between age and sbp.
cor_test <- cor.test(
  x = cor_data$age,
  y = cor_data$sbp,
  method = "spearman"
)

# Scatter plot of age against sbp with a red linear fit line; the subtitle
# reports the Spearman estimate and its p-value.
ggplot(data = cor_data, mapping = aes(age, sbp)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  theme_minimal() +
  labs(
    title = "年龄与收缩压的相关性分析",
    subtitle = paste(
      "Spearman相关系数 =",
      round(cor_test$estimate, 3),
      "\np值 =",
      format.pval(cor_test$p.value, digits = 3)
    ),
    x = "年龄 (岁)",
    y = "收缩压 (mmHg)"
  )

5.3.2 相关系数矩阵可视化

代码
library(corrplot)

# Pairwise Spearman correlations between all columns of cor_data.
cor_matrix <- cor(cor_data, method = "spearman")

# Plot the correlation matrix: upper triangle only, no diagonal, coloured
# cells with the coefficients and labels printed in black.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)

练习
  1. 使用自己的数据集进行描述性统计分析
  2. 比较两组患者的实验室指标差异
  3. 分析疾病发生率在不同人群中的差异
  4. 探索多个临床指标之间的相关性

5.4 本章小结

在本章中,我们学习了:

  1. 如何进行基本的描述性统计分析
  2. t检验和卡方检验的应用
  3. 相关性分析方法
  4. 使用专业的统计表格和图形展示结果

下一章,我们将学习临床研究数据处理的高级技巧。