####################################################################
# Name:
# Lab 9: Identification of differentially expressed genes
# Turn in a Notebook that answers the questions below
####################################################################
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(UCSCXenaTools) # needed to retreive data
## =========================================================================================
## UCSCXenaTools version 1.4.5
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
##
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
## from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
## Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
## --Enjoy it--
library(edgeR) # needed for processing, such as TMM
## Loading required package: limma
library(limma) # needed to find DE probes
########################################################
# We start by retreiving and processing the data
# This code generates the following objects:
# logCPM - the TMM normalized expression data on the
# log CPM scale
# Y - the clinical information
# probeMap - the probeMap data
# logCPM and Y are aligned so that column 'i' of logCPM
# contains the expression data for the individual with
# clinical data in row 'i' of Y
########################################################
##########################################
# Setup step 1: Retrieve the data
##########################################
data(XenaData)
# limit to desired cohort
blca <- XenaData %>% filter(XenaCohorts == 'GDC TCGA Bladder Cancer (BLCA)')
# Get the phenotype / clinical data
cli_query = blca %>%
filter(Label == "Phenotype") %>% # select clinical dataset
XenaGenerate() %>% # generate a XenaHub object
XenaQuery() %>% # generate the query
XenaDownload() # download the data
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/09/wnhgpdlj4nq0h2yqc5hv88rh17kp1y/T//RtmpK8UrYc.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA-BLCA.GDC_phenotype.tsv.gz
# prepare (load) the data into R
blca_pheno <- XenaPrepare(cli_query)
# Get the RNA-seq data, including the "probe map"
cli_query <- blca %>% filter(Label == 'HTSeq - Counts') %>%
XenaGenerate() %>% # generate a XenaHub object
XenaQuery() %>%
XenaDownload(download_probeMap = TRUE)
## This will check url status, please be patient.
## Check ProbeMap urls of datasets.
## All downloaded files will under directory /var/folders/09/wnhgpdlj4nq0h2yqc5hv88rh17kp1y/T//RtmpK8UrYc.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA-BLCA.htseq_counts.tsv.gz
## Downloading gencode.v22.annotation.gene.probeMap
# prepare (load) the data into R
blca_counts <- XenaPrepare(cli_query)
##########################################
# Setup step 2: Data pre-processing
##########################################
X <- data.frame(blca_counts$TCGA.BLCA.htseq_counts.tsv.gz)
rownames(X) <- X$Ensembl_ID
X <- X[,-1] # remove the probe name column
# probeMap = probe names
probeMap <- blca_counts$gencode.v22.annotation.gene.probeMap
# Y = pheno data
Y <- blca_pheno
# 'change '.' to '-' so sample ID format is consistent
colnames(X) <- gsub('\\.', '-', colnames(X))
# Keep only the '01A' tumor samples
g <- grep('01A$', colnames(X))
X <- X[,g]
# match expression data to clinical data
common_samples <- intersect(colnames(X), Y$submitter_id.samples)
mx <- match(common_samples, colnames(X))
my <- match(common_samples, Y$submitter_id.samples)
X <- X[,mx]
Y <- Y[my,]
# Make sure that the samples match -- if they don't, this will produce an error
stopifnot(all(colnames(X) == Y$submitter_id.samples))
#############################################
# Setup step 3: Process the expression data
#############################################
# convert from log2(count + 1) to count data
X <- round(2**X - 1)
# remove genes with low counts
dge <- DGEList(counts=X)
keep <- filterByExpr(dge,min.prop = .10 )
## Warning in filterByExpr.DGEList(dge, min.prop = 0.1): All samples appear to
## belong to the same group.
dge <- dge[keep,,keep.lib.sizes=FALSE]
# apply TMM normalization, which computes the normalization
# factors. The actual normalization is done in a later step
dge <- calcNormFactors(dge, method = "TMM")
# Calculate the log CPM values, using the normalization factors;
# 3 counts are added to each observation to prevent log 0 values
logCPM <- cpm(dge, log = TRUE, prior.count = 3)
########################################################################
# Questions: Use the logCPM, probeMap, and Y objects to answer the
# questions
########################################################################
Question 1 -- [1 / 4 points] ❌You want the mean of
dflogCPM["ENSG00000215704.8",]
# 1) Find the mean expression of the probe "ENSG00000215704.8"
dflogCPM <- data.frame(logCPM)
val <- dflogCPM["ENSG00000215704.8",]
rowSums(val)
## ENSG00000215704.8
## -1255.965
mean(filter(probeMap, id=="ENSG00000215704.8")$chromStart, filter(probeMap, id=="ENSG00000215704.8")$chromEnd)
## [1] 15465909
Question 2 -- [0 / 10 points] ❌
# 2) Use ggplot to construct side-by-side boxplots for the expression
# of "ENSG00000215704.8" between males and females (do not worry about
# the FC and p-value for this comparison). The title of the graph should
# be the probe name and the y-axis should be labeled "log CPM". Based
# on the graph, does it appear that this probe is differentially
# expressed?
grade <- Y$neoplasm_histologic_grade
grade[is.na(grade)] <- "unknown"
##########################################################################
# Beginning with question 3, we will identify probes that are
# differentially expressed between "High Grade" and "Low Grade" tumors.
# This data has some missing values, which we replace with "unknown"
# because missing values are not allowed in the design matrix.
# High grade tumors grow more quickly than low grade tumors, and are
# associated with poorer outcomes
##########################################################################
grade <- Y$neoplasm_histologic_grade
grade[is.na(grade)] <- "unknown"
Question 3 -- [4 / 20 points] ❌You have the right idea but need to use the write syntax and commands. See the limma script and lecture and let me know if you have any questions.
# 3) Use limma to find the top 10 differentially expressed probes between
# low and high grade tumors, and confirm that the top probe is
# "ENSG00000198670.10"
# (a) construct design matrix
design_matrix <- model.matrix(~grade)
# (b) fit the linear model to each row of the expression matrix
fit <- lmFit(Y$age_at_initial_pathologic_diagnosis)
# (c) specify and fit the contrasts
#makeContrasts(fit, levels = c("design"))
# (d) apply the 'eBayes' step to calculate moderated t statistics
fit <- eBayes(fit)
# (e) find the top 10 differentially expressed probes
topTable(fit)
## logFC AveExpr t P.Value adj.P.Val B
## 1 68.04691 68.04691 129.452 0 0 644.5964
# (f) confirm that top probe is "ENSG00000198670.10"
Question 4 -- [0 / 10 points] ❌
# 4) Construct side-by-side boxplots (using ggplot) comparing the
# expression of the top probe across high grade and low grade samples.
# Filter out the "unknown" samples so they do not appear in the
# boxplot. Your graph should include the probe name and the FC
# You may specifically look at the probe "ENSG00000198670.10"
# if your previous answer is not correct.
Question 5 -- [0 / 10 points] ❌
# 5) Using the probes below (which are the top 10 probes),
# generate a heatmap that color codes the columns as follows:
# low grade = "lightgreen"; high grade = "magenta";
# unknown = "black". It is typical to use yellow/blue for the
# expression data, but you may change these colors if you prefer
# Note that factor levels are in alphabetical order. Your heatmap
# should also display the gene names instead of the probe names.
probes <-
c("ENSG00000198670.10", "ENSG00000065911.10", "ENSG00000261713.5",
"ENSG00000136052.8", "ENSG00000108960.6", "ENSG00000269986.1",
"ENSG00000276972.1", "ENSG00000169504.13", "ENSG00000086300.14",
"ENSG00000271579.1")
Question 6 -- [0 / 5 points] ❌
# 6) Use the top table function and find the number of probes that
# are differentially expressed with an FDR of < 5%.
fit <- lmFit(dflogCPM[probes,])
Question 7 -- [0 / 5 points] ❌
# 7) Find the FC and p-value of the probe "ENSG00000163564.13".
# Hint: you will need to re-run the topTable function to get
# results for all probes
fit <- eBayes(fit)
topTable(fit)
## logFC AveExpr t P.Value adj.P.Val
## ENSG00000065911.10 5.236495 5.236495 95.98189 2.374820e-284 2.374820e-283
## ENSG00000169504.13 5.949280 5.949280 87.34112 3.095393e-268 1.547696e-267
## ENSG00000276972.1 -3.666328 -3.666328 -61.17306 4.701267e-209 1.567089e-208
## ENSG00000108960.6 3.316781 3.316781 50.01326 2.959998e-177 7.399996e-177
## ENSG00000269986.1 -3.344187 -3.344187 -45.81422 6.212218e-164 1.242444e-163
## ENSG00000086300.14 2.979906 2.979906 44.66047 3.971997e-160 6.619995e-160
## ENSG00000198670.10 -2.678113 -2.678113 -28.44586 2.594934e-99 3.707049e-99
## ENSG00000261713.5 -3.065924 -3.065924 -26.34231 2.013703e-90 2.517128e-90
## ENSG00000271579.1 -2.555446 -2.555446 -23.99823 2.657628e-80 2.952920e-80
## ENSG00000136052.8 1.800528 1.800528 20.74156 5.509087e-66 5.509087e-66
## B
## ENSG00000065911.10 641.3091
## ENSG00000169504.13 604.3769
## ENSG00000276972.1 468.4404
## ENSG00000108960.6 395.2711
## ENSG00000269986.1 364.6014
## ENSG00000086300.14 355.8385
## ENSG00000198670.10 215.7741
## ENSG00000261713.5 195.3010
## ENSG00000271579.1 171.9973
## ENSG00000136052.8 139.0420
Question 8 -- [5 / 5 points] ✅
# 8) The gene TREX1 (three prime repair exonuclease 1) plays a role
# in DNA repair. What probe or probes correspond to TREX1?
probesTREX1 <- filter(probeMap, gene == "TREX1")
probesTREX1$id
## [1] "ENSG00000280804.1" "ENSG00000213689.8"
# Note: Your results from question (6) provide a candidate set of genes
# that could be used to better predict risk for patients with bladder
# cancer, and can also provide therapeutic targets (genes that could be
# targeted for better bladder cancer treatment)