Total grade: 10/69 (14%)

Make sure to go over the limma script and lectures, as most of this follows those examples. If you have any questions, let me know.

####################################################################
# Name:
# Lab 9: Identification of differentially expressed genes

# Turn in a Notebook that answers the questions below
####################################################################

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(UCSCXenaTools) # needed to retreive data
## =========================================================================================
## UCSCXenaTools version 1.4.5
## Project URL: https://github.com/ropensci/UCSCXenaTools
## Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
## 
## If you use it in published research, please cite:
## Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
##   from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
##   Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
## =========================================================================================
##                               --Enjoy it--
library(edgeR) # needed for processing, such as TMM
## Loading required package: limma
library(limma) # needed to find DE probes

########################################################
# We start by retreiving and processing the data
# This code generates the following objects:
#  logCPM - the TMM normalized expression data on the 
#              log CPM scale
#  Y - the clinical information
#  probeMap - the probeMap data

# logCPM and Y are aligned so that column 'i' of logCPM 
# contains the expression data for the individual with
# clinical data in row 'i' of Y
########################################################


##########################################
# Setup step 1: Retrieve the data
##########################################

data(XenaData)

# limit to desired cohort
blca <- XenaData %>% filter(XenaCohorts == 'GDC TCGA Bladder Cancer (BLCA)')

# Get the phenotype / clinical data
cli_query = blca %>%
  filter(Label == "Phenotype") %>%  # select clinical dataset
  XenaGenerate() %>%  # generate a XenaHub object
  XenaQuery() %>%     # generate the query
  XenaDownload()      # download the data
## This will check url status, please be patient.
## All downloaded files will under directory /var/folders/09/wnhgpdlj4nq0h2yqc5hv88rh17kp1y/T//RtmpK8UrYc.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA-BLCA.GDC_phenotype.tsv.gz
# prepare (load) the data into R
blca_pheno <- XenaPrepare(cli_query)

# Get the RNA-seq data, including the "probe map"
cli_query <- blca %>% filter(Label == 'HTSeq - Counts') %>%
  XenaGenerate() %>%  # generate a XenaHub object
  XenaQuery() %>%
  XenaDownload(download_probeMap = TRUE)
## This will check url status, please be patient.
## Check ProbeMap urls of datasets.
## All downloaded files will under directory /var/folders/09/wnhgpdlj4nq0h2yqc5hv88rh17kp1y/T//RtmpK8UrYc.
## The 'trans_slash' option is FALSE, keep same directory structure as Xena.
## Creating directories for datasets...
## Downloading TCGA-BLCA.htseq_counts.tsv.gz
## Downloading gencode.v22.annotation.gene.probeMap
# prepare (load) the data into R
blca_counts <- XenaPrepare(cli_query)



##########################################
# Setup step 2: Data pre-processing
##########################################

X <- data.frame(blca_counts$TCGA.BLCA.htseq_counts.tsv.gz)
rownames(X) <- X$Ensembl_ID
X <- X[,-1]  # remove the probe name column

# probeMap = probe names
probeMap <- blca_counts$gencode.v22.annotation.gene.probeMap

# Y = pheno data
Y <- blca_pheno


# 'change '.' to '-' so sample ID format is consistent
colnames(X) <- gsub('\\.', '-', colnames(X))

# Keep only the '01A' tumor samples
g <- grep('01A$', colnames(X))
X <- X[,g]


# match expression data to clinical data
common_samples <- intersect(colnames(X), Y$submitter_id.samples)
mx <- match(common_samples, colnames(X))
my <- match(common_samples, Y$submitter_id.samples)

X <- X[,mx]
Y <- Y[my,]

# Make sure that the samples match -- if they don't, this will produce an error
stopifnot(all(colnames(X) == Y$submitter_id.samples))


#############################################
# Setup step 3: Process the expression data
#############################################

# convert from log2(count + 1) to count data
X <- round(2**X - 1)

# remove genes with low counts
dge <- DGEList(counts=X)
keep <- filterByExpr(dge,min.prop = .10 )
## Warning in filterByExpr.DGEList(dge, min.prop = 0.1): All samples appear to
## belong to the same group.
dge <- dge[keep,,keep.lib.sizes=FALSE]

# apply TMM normalization, which computes the normalization 
# factors. The actual normalization is done in a later step
dge <- calcNormFactors(dge, method = "TMM")

# Calculate the log CPM values, using the normalization factors;
# 3 counts are added to each observation to prevent log 0 values
logCPM <- cpm(dge, log = TRUE, prior.count = 3)


########################################################################
# Questions: Use the logCPM, probeMap, and Y objects to answer the
# questions
########################################################################

Question 1 -- [1 / 4 points] ❌

You want the mean of
dflogCPM["ENSG00000215704.8",]


# 1) Find the mean expression of the probe "ENSG00000215704.8" dflogCPM <- data.frame(logCPM) val <- dflogCPM["ENSG00000215704.8",] rowSums(val)
## ENSG00000215704.8 
##         -1255.965
mean(filter(probeMap, id=="ENSG00000215704.8")$chromStart, filter(probeMap, id=="ENSG00000215704.8")$chromEnd)
## [1] 15465909
Question 2 -- [0 / 10 points] ❌

# 2) Use ggplot to construct side-by-side boxplots for the expression # of "ENSG00000215704.8" between males and females (do not worry about # the FC and p-value for this comparison). The title of the graph should # be the probe name and the y-axis should be labeled "log CPM". Based # on the graph, does it appear that this probe is differentially # expressed? grade <- Y$neoplasm_histologic_grade grade[is.na(grade)] <- "unknown" ########################################################################## # Beginning with question 3, we will identify probes that are # differentially expressed between "High Grade" and "Low Grade" tumors. # This data has some missing values, which we replace with "unknown" # because missing values are not allowed in the design matrix. # High grade tumors grow more quickly than low grade tumors, and are # associated with poorer outcomes ########################################################################## grade <- Y$neoplasm_histologic_grade grade[is.na(grade)] <- "unknown"
Question 3 -- [4 / 20 points] ❌

You have the right idea but need to use the write syntax and commands. See the limma script and lecture and let me know if you have any questions.

# 3) Use limma to find the top 10 differentially expressed probes between # low and high grade tumors, and confirm that the top probe is # "ENSG00000198670.10" # (a) construct design matrix design_matrix <- model.matrix(~grade) # (b) fit the linear model to each row of the expression matrix fit <- lmFit(Y$age_at_initial_pathologic_diagnosis) # (c) specify and fit the contrasts #makeContrasts(fit, levels = c("design")) # (d) apply the 'eBayes' step to calculate moderated t statistics fit <- eBayes(fit) # (e) find the top 10 differentially expressed probes topTable(fit)
##      logFC  AveExpr       t P.Value adj.P.Val        B
## 1 68.04691 68.04691 129.452       0         0 644.5964
# (f) confirm that top probe is "ENSG00000198670.10"


Question 4 -- [0 / 10 points] ❌

# 4) Construct side-by-side boxplots (using ggplot) comparing the # expression of the top probe across high grade and low grade samples. # Filter out the "unknown" samples so they do not appear in the # boxplot. Your graph should include the probe name and the FC # You may specifically look at the probe "ENSG00000198670.10" # if your previous answer is not correct.
Question 5 -- [0 / 10 points] ❌

# 5) Using the probes below (which are the top 10 probes), # generate a heatmap that color codes the columns as follows: # low grade = "lightgreen"; high grade = "magenta"; # unknown = "black". It is typical to use yellow/blue for the # expression data, but you may change these colors if you prefer # Note that factor levels are in alphabetical order. Your heatmap # should also display the gene names instead of the probe names. probes <- c("ENSG00000198670.10", "ENSG00000065911.10", "ENSG00000261713.5", "ENSG00000136052.8", "ENSG00000108960.6", "ENSG00000269986.1", "ENSG00000276972.1", "ENSG00000169504.13", "ENSG00000086300.14", "ENSG00000271579.1")
Question 6 -- [0 / 5 points] ❌

# 6) Use the top table function and find the number of probes that # are differentially expressed with an FDR of < 5%. fit <- lmFit(dflogCPM[probes,])
Question 7 -- [0 / 5 points] ❌

# 7) Find the FC and p-value of the probe "ENSG00000163564.13". # Hint: you will need to re-run the topTable function to get # results for all probes fit <- eBayes(fit) topTable(fit)
##                        logFC   AveExpr         t       P.Value     adj.P.Val
## ENSG00000065911.10  5.236495  5.236495  95.98189 2.374820e-284 2.374820e-283
## ENSG00000169504.13  5.949280  5.949280  87.34112 3.095393e-268 1.547696e-267
## ENSG00000276972.1  -3.666328 -3.666328 -61.17306 4.701267e-209 1.567089e-208
## ENSG00000108960.6   3.316781  3.316781  50.01326 2.959998e-177 7.399996e-177
## ENSG00000269986.1  -3.344187 -3.344187 -45.81422 6.212218e-164 1.242444e-163
## ENSG00000086300.14  2.979906  2.979906  44.66047 3.971997e-160 6.619995e-160
## ENSG00000198670.10 -2.678113 -2.678113 -28.44586  2.594934e-99  3.707049e-99
## ENSG00000261713.5  -3.065924 -3.065924 -26.34231  2.013703e-90  2.517128e-90
## ENSG00000271579.1  -2.555446 -2.555446 -23.99823  2.657628e-80  2.952920e-80
## ENSG00000136052.8   1.800528  1.800528  20.74156  5.509087e-66  5.509087e-66
##                           B
## ENSG00000065911.10 641.3091
## ENSG00000169504.13 604.3769
## ENSG00000276972.1  468.4404
## ENSG00000108960.6  395.2711
## ENSG00000269986.1  364.6014
## ENSG00000086300.14 355.8385
## ENSG00000198670.10 215.7741
## ENSG00000261713.5  195.3010
## ENSG00000271579.1  171.9973
## ENSG00000136052.8  139.0420
Question 8 -- [5 / 5 points] ✅

# 8) The gene TREX1 (three prime repair exonuclease 1) plays a role # in DNA repair. What probe or probes correspond to TREX1? probesTREX1 <- filter(probeMap, gene == "TREX1") probesTREX1$id
## [1] "ENSG00000280804.1" "ENSG00000213689.8"
# Note: Your results from question (6) provide a candidate set of genes
# that could be used to better predict risk for patients with bladder
# cancer, and can also provide therapeutic targets (genes that could be
# targeted for better bladder cancer treatment)