10 生物信息学入门

代码

# 设置CRAN镜像为中国合肥镜像
options(repos = c(CRAN = "https://mirrors.ustc.edu.cn/CRAN/"))

# 检查并安装BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

# 设置Bioconductor镜像为中科大镜像
options(BioC_mirror = "https://mirrors.ustc.edu.cn/bioc")

# 检查并安装所需的R包
required_packages <- c(
  "tidyverse",           # 数据处理和可视化
  "pheatmap",            # 热图
  "igraph"               # 网络分析
)

# 检查并安装CRAN包
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages, dependencies = TRUE)

# 安装Bioconductor包
bioc_packages <- c(
  "GEOquery",            # GEO数据获取
  "limma",               # 差异表达分析
  "clusterProfiler",     # 通路富集分析
  "org.Hs.eg.db",        # 人类基因注释
  "enrichplot"           # 富集分析可视化
)

# 安装缺失的Bioconductor包
BiocManager::install(bioc_packages[!(bioc_packages %in% installed.packages()[,"Package"])], 
                    site_repository = "https://mirrors.ustc.edu.cn/bioc",
                    update = FALSE, 
                    ask = FALSE)

# 加载所需的包
invisible(lapply(c(required_packages, bioc_packages), library, character.only = TRUE))

10.1 基因表达数据分析

10.1.1 GEO数据库数据获取

代码

library(limma)
library(tidyverse)
library(pheatmap)
library(enrichplot)
library(clusterProfiler)
library(org.Hs.eg.db)

# 创建模拟表达数据
set.seed(123)
n_genes <- 1000
n_samples <- 20

# 生成表达矩阵
expr_matrix <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples)
rownames(expr_matrix) <- paste0("gene", 1:n_genes)
colnames(expr_matrix) <- paste0("sample", 1:n_samples)

# 创建分组信息
groups <- factor(rep(c("Control", "Treatment"), each = n_samples/2))

# 展示数据维度
dim(expr_matrix)

[1] 1000   20

10.1.2 差异表达基因分析

代码

# 构建设计矩阵
design <- model.matrix(~0 + groups)
colnames(design) <- c("Control", "Treatment")

# 差异分析
fit <- lmFit(expr_matrix, design)
contrast.matrix <- makeContrasts(Treatment - Control, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

# 获取差异基因
deg_results <- topTable(fit2, number = Inf, adjust.method = "BH")
deg_results$gene_id <- rownames(deg_results)

# 可视化火山图
ggplot(deg_results, aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(aes(color = abs(logFC) > 1 & adj.P.Val < 0.05)) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  scale_color_manual(values = c("grey", "red")) +
  labs(title = "差异表达基因火山图",
       x = "Log2 Fold Change",
       y = "-log10(Adjusted P-value)") +
  theme_minimal()

10.2 富集分析实践

10.2.1 GO/KEGG分析流程

代码

# 创建模拟基因ID（使用ENTREZ ID）
set.seed(123)
all_genes <- sample(keys(org.Hs.eg.db, keytype = "ENTREZID"), n_genes)
rownames(expr_matrix) <- all_genes

# 获取差异基因
deg_genes <- deg_results %>%
  filter(abs(logFC) > 1, adj.P.Val < 0.05) %>%
  pull(gene_id)

# GO富集分析
ego <- enrichGO(gene = deg_genes,
                universe = all_genes,
                OrgDb = org.Hs.eg.db,
                keyType = "ENTREZID",
                ont = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff = 0.05,
                qvalueCutoff = 0.05)

# 可视化GO富集结果
if(!is.null(ego) && nrow(ego) > 0) {
  dotplot(ego, showCategory = 15) +
    ggtitle("GO Biological Process Enrichment")
} else {
  message("No significant GO terms found")
}

# 创建热图
if(length(deg_genes) > 0) {
  top_genes <- head(deg_genes, 50)
  heatmap_data <- expr_matrix[top_genes, ]
  
  # 创建分组注释
  annotation_col <- data.frame(
    Group = groups,
    row.names = colnames(expr_matrix)
  )
  
  # 绘制热图
  pheatmap(heatmap_data,
           scale = "row",
           show_rownames = TRUE,
           annotation_col = annotation_col,
           main = "Top 50 差异表达基因热图")
} else {
  message("No significant genes found for heatmap")
}

10.2.2 结果可视化（气泡图）

代码

# 如果有显著的富集结果，创建气泡图
if(!is.null(ego) && nrow(ego) > 0) {
  # 获取前10个富集通路
  top_pathways <- as.data.frame(ego) %>%
    slice_head(n = 10)
  
  # 创建富集分析气泡图
  ggplot(top_pathways, 
         aes(x = Count, y = reorder(Description, Count))) +
    geom_point(aes(size = Count, color = p.adjust)) +
    scale_color_gradient(low = "red", high = "blue") +
    labs(title = "GO富集分析结果",
         x = "基因数量",
         y = "GO Term") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))
} else {
  message("No significant pathways for visualization")
}

# 创建基因表达模式图
if(length(deg_genes) > 0) {
  # 选择表达变化最显著的基因
  top_varied_genes <- deg_results %>%
    arrange(desc(abs(logFC))) %>%
    slice_head(n = 20) %>%
    pull(gene_id)
  
  # 准备数据
  expression_data <- expr_matrix[top_varied_genes, ] %>%
    scale() %>%  # 对每个基因进行z-score标准化
    t() %>%      # 转置矩阵
    as.data.frame() %>%
    mutate(Sample = rownames(.)) %>%
    gather(Gene, Expression, -Sample)
  
  # 创建表达模式图
  ggplot(expression_data, aes(x = Sample, y = Gene)) +
    geom_tile(aes(fill = Expression)) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          axis.text.y = element_text(size = 8)) +
    labs(title = "Top 20 差异表达基因表达模式",
         x = "样本",
         y = "基因")
} else {
  message("No significant genes for visualization")
}

10.3 蛋白质组学基础

10.3.1 质谱数据预处理

代码

# 注：以下代码仅作演示，需要实际的质谱数据才能运行
library(MSnbase)
library(MALDIquant)

# 读取质谱数据
raw_data <- readMSData("ms_data.mzML", mode = "onDisk")

# 数据预处理流程
processed_data <- raw_data %>%
  # 降噪
  removeBaseline() %>%
  # 峰对齐
  alignRt() %>%
  # 峰检测
  findPeaks() %>%
  # 定量
  quantify()

10.3.2 蛋白质互作网络

代码

library(igraph)

# 创建示例蛋白质互作数据
set.seed(123)
interactions <- data.frame(
  protein1 = sample(LETTERS[1:10], 20, replace = TRUE),
  protein2 = sample(LETTERS[1:10], 20, replace = TRUE)
)

# 构建网络
network <- graph_from_data_frame(interactions, directed = FALSE)

# 计算网络特征
degree <- degree(network)
betweenness <- betweenness(network)

# 可视化网络
plot(network,
     vertex.size = degree * 5,
     vertex.label.cex = 0.8,
     vertex.color = "lightblue",
     main = "蛋白质互作网络")

练习

下载并分析一个GEO数据集
进行差异表达分析和通路富集
创建基因表达热图
构建蛋白质互作网络

10.4 本章小结

在本章中，我们学习了：

如何获取和处理基因表达数据
差异表达分析的基本流程
通路富集分析方法
蛋白质组学数据分析基础

下一章，我们将通过综合案例来应用所学知识。

# 生物信息学入门 {#sec-bioinformatics} ```{r} #| label: check-packages #| message: false # 设置CRAN镜像为中国合肥镜像 options(repos = c(CRAN = "https://mirrors.ustc.edu.cn/CRAN/")) # 检查并安装BiocManager if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") # 设置Bioconductor镜像为中科大镜像 options(BioC_mirror = "https://mirrors.ustc.edu.cn/bioc") # 检查并安装所需的R包 required_packages <- c( "tidyverse", # 数据处理和可视化 "pheatmap", # 热图 "igraph" # 网络分析 ) # 检查并安装CRAN包 new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])] if(length(new_packages)) install.packages(new_packages, dependencies = TRUE) # 安装Bioconductor包 bioc_packages <- c( "GEOquery", # GEO数据获取 "limma", # 差异表达分析 "clusterProfiler", # 通路富集分析 "org.Hs.eg.db", # 人类基因注释 "enrichplot" # 富集分析可视化 ) # 安装缺失的Bioconductor包 BiocManager::install(bioc_packages[!(bioc_packages %in% installed.packages()[,"Package"])], site_repository = "https://mirrors.ustc.edu.cn/bioc", update = FALSE, ask = FALSE) # 加载所需的包 invisible(lapply(c(required_packages, bioc_packages), library, character.only = TRUE)) ``` ## 基因表达数据分析 {#sec-gene-expression} ### GEO数据库数据获取 {#sec-geo-data} ```{r} #| label: setup #| message: false library(limma) library(tidyverse) library(pheatmap) library(enrichplot) library(clusterProfiler) library(org.Hs.eg.db) # 创建模拟表达数据 set.seed(123) n_genes <- 1000 n_samples <- 20 # 生成表达矩阵 expr_matrix <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples) rownames(expr_matrix) <- paste0("gene", 1:n_genes) colnames(expr_matrix) <- paste0("sample", 1:n_samples) # 创建分组信息 groups <- factor(rep(c("Control", "Treatment"), each = n_samples/2)) # 展示数据维度 dim(expr_matrix) ``` ### 差异表达基因分析 {#sec-deg-analysis} ```{r} #| label: deg-analysis # 构建设计矩阵 design <- model.matrix(~0 + groups) colnames(design) <- c("Control", "Treatment") # 差异分析 fit <- lmFit(expr_matrix, design) contrast.matrix <- makeContrasts(Treatment - Control, levels = design) fit2 <- contrasts.fit(fit, contrast.matrix) fit2 <- eBayes(fit2) # 获取差异基因 deg_results <- topTable(fit2, number = Inf, adjust.method = "BH") deg_results$gene_id <- rownames(deg_results) # 可视化火山图 ggplot(deg_results, aes(x = logFC, y = -log10(adj.P.Val))) + geom_point(aes(color = abs(logFC) > 1 & adj.P.Val < 0.05)) + geom_vline(xintercept = c(-1, 1), linetype = "dashed") + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + scale_color_manual(values = c("grey", "red")) + labs(title = "差异表达基因火山图", x = "Log2 Fold Change", y = "-log10(Adjusted P-value)") + theme_minimal() ``` ## 富集分析实践 {#sec-enrichment} ### GO/KEGG分析流程 {#sec-pathway-analysis} ```{r} #| label: enrichment-analysis # 创建模拟基因ID（使用ENTREZ ID） set.seed(123) all_genes <- sample(keys(org.Hs.eg.db, keytype = "ENTREZID"), n_genes) rownames(expr_matrix) <- all_genes # 获取差异基因 deg_genes <- deg_results %>% filter(abs(logFC) > 1, adj.P.Val < 0.05) %>% pull(gene_id) # GO富集分析 ego <- enrichGO(gene = deg_genes, universe = all_genes, OrgDb = org.Hs.eg.db, keyType = "ENTREZID", ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.05) # 可视化GO富集结果 if(!is.null(ego) && nrow(ego) > 0) { dotplot(ego, showCategory = 15) + ggtitle("GO Biological Process Enrichment") } else { message("No significant GO terms found") } # 创建热图 if(length(deg_genes) > 0) { top_genes <- head(deg_genes, 50) heatmap_data <- expr_matrix[top_genes, ] # 创建分组注释 annotation_col <- data.frame( Group = groups, row.names = colnames(expr_matrix) ) # 绘制热图 pheatmap(heatmap_data, scale = "row", show_rownames = TRUE, annotation_col = annotation_col, main = "Top 50 差异表达基因热图") } else { message("No significant genes found for heatmap") } ``` ### 结果可视化（气泡图） {#sec-visualization} ```{r} #| label: pathway-viz # 如果有显著的富集结果，创建气泡图 if(!is.null(ego) && nrow(ego) > 0) { # 获取前10个富集通路 top_pathways <- as.data.frame(ego) %>% slice_head(n = 10) # 创建富集分析气泡图 ggplot(top_pathways, aes(x = Count, y = reorder(Description, Count))) + geom_point(aes(size = Count, color = p.adjust)) + scale_color_gradient(low = "red", high = "blue") + labs(title = "GO富集分析结果", x = "基因数量", y = "GO Term") + theme_minimal() + theme(axis.text.y = element_text(size = 8)) } else { message("No significant pathways for visualization") } # 创建基因表达模式图 if(length(deg_genes) > 0) { # 选择表达变化最显著的基因 top_varied_genes <- deg_results %>% arrange(desc(abs(logFC))) %>% slice_head(n = 20) %>% pull(gene_id) # 准备数据 expression_data <- expr_matrix[top_varied_genes, ] %>% scale() %>% # 对每个基因进行z-score标准化 t() %>% # 转置矩阵 as.data.frame() %>% mutate(Sample = rownames(.)) %>% gather(Gene, Expression, -Sample) # 创建表达模式图 ggplot(expression_data, aes(x = Sample, y = Gene)) + geom_tile(aes(fill = Expression)) + scale_fill_gradient2(low = "blue", mid = "white", high = "red") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text(size = 8)) + labs(title = "Top 20 差异表达基因表达模式", x = "样本", y = "基因") } else { message("No significant genes for visualization") } ``` ## 蛋白质组学基础 {#sec-proteomics} ### 质谱数据预处理 {#sec-ms-preprocessing} ```{r} #| label: proteomics-demo #| eval: false # 注：以下代码仅作演示，需要实际的质谱数据才能运行 library(MSnbase) library(MALDIquant) # 读取质谱数据 raw_data <- readMSData("ms_data.mzML", mode = "onDisk") # 数据预处理流程 processed_data <- raw_data %>% # 降噪 removeBaseline() %>% # 峰对齐 alignRt() %>% # 峰检测 findPeaks() %>% # 定量 quantify() ``` ### 蛋白质互作网络 {#sec-protein-network} ```{r} #| label: protein-network library(igraph) # 创建示例蛋白质互作数据 set.seed(123) interactions <- data.frame( protein1 = sample(LETTERS[1:10], 20, replace = TRUE), protein2 = sample(LETTERS[1:10], 20, replace = TRUE) ) # 构建网络 network <- graph_from_data_frame(interactions, directed = FALSE) # 计算网络特征 degree <- degree(network) betweenness <- betweenness(network) # 可视化网络 plot(network, vertex.size = degree * 5, vertex.label.cex = 0.8, vertex.color = "lightblue", main = "蛋白质互作网络") ``` ::: callout-tip ## 练习 1. 下载并分析一个GEO数据集 2. 进行差异表达分析和通路富集 3. 创建基因表达热图 4. 构建蛋白质互作网络 ::: ## 本章小结在本章中，我们学习了： 1. 如何获取和处理基因表达数据 2. 差异表达分析的基本流程 3. 通路富集分析方法 4. 蛋白质组学数据分析基础下一章，我们将通过综合案例来应用所学知识。