Skip to content

Uploading Single Cell Sequencing data

JPReceveur edited this page Aug 18, 2023 · 4 revisions

A. Prepare the files using the raw matrix from 10x platform

  1. You can find the files in the raw_feature_bc_matrix folder of the 10x cellranger output. There are usually 3 compressed files: barcodes.tsv.gz, features.tsv.gz and matrix.mtx.gz.

    features.tsv.gz has 3 columns; please keep only the first 2 columns:

# gunzip -c is used instead of zcat because the stock macOS zcat only accepts .Z files
gunzip -c features.tsv.gz | cut -f1,2 > genes.tsv
Decompress all the gzip files:
gunzip *.gz

renames features.tsv to genes.tsv:

# Build the two-column genes.tsv from the decompressed features.tsv, then delete
# features.tsv. A plain `mv features.tsv genes.tsv` would bring the third column
# back and overwrite any two-column genes.tsv created earlier.
cut -f1,2 features.tsv > genes.tsv && rm features.tsv
# there should be 3 files for next step (MUST BE NAMED AS FOLLOWING): barcodes.tsv genes.tsv matrix.mtx

Bundle the three files into a tar archive:
tar -cf upload1.tar *.tsv *.mtx (windows)

COPYFILE_DISABLE=1 tar -cf upload1.tar *.tsv *.mtx (mac)

B. Prepare the files from Seurat-processed output (all the commands below are R code)

suppressMessages(library("Seurat"));suppressMessages(library(dplyr)); suppressMessages(library(biomaRt))

# Build the per-cell observation table for a Seurat object.
#
# object:    a Seurat object that already contains the requested reduction.
# group:     not used by this function; kept (now optional) so existing
#            calls that pass it keep working.
# reduction: name of the dimensional reduction whose embeddings are appended
#            to the table; defaults to "umap", the value used previously.
#
# Returns a data.frame whose first column, "observations", holds the row
# names of the object's meta.data, followed by the meta.data columns and the
# embedding columns of the chosen reduction.
metadata <- function(object, group = NULL, reduction = "umap") {
  cell_meta <- object@meta.data
  coords <- Embeddings(object[[reduction]])
  meta <- as.data.frame(cbind(row.names(cell_meta), cell_meta, coords))
  # The cell-name column gets an auto-generated name from cbind(); give it the
  # fixed name "observations".
  colnames(meta)[1] <- "observations"
  meta
}

Load the RDS file that holds your Seurat analysis result (replace the path with your own):

SeuratObj<-readRDS("SeuratObject.rds")



## Create the metadata sheet using the metadata() function above.
obs <- metadata(SeuratObj)

# The dense expression table (t1.m) is built in the "Prepare annotation" step,
# so the costly conversion is not repeated here.

# Add a replicate column: a running index of the cells within each cell_type.
# ungroup() afterwards so the grouping does not leak into later steps.
# NOTE(review): assumes the Seurat meta.data has a "cell_type" column - confirm.
obs <- obs %>%
  group_by(cell_type) %>%
  mutate(replicate = row_number()) %>%
  ungroup()

## Add cluster_label to enable the primary analysis function.
obs$cluster_label <- obs$cell_type

##if you have the hexcode for each cell type to color, name that column as "cell_type_colors"

Make sure the cluster names are not in numeric format

In order to enable the “primary analysis” function of the single cell workbench, at least one of the column names MUST be in this list: ['cluster', 'cell_type', 'cluster_label', 'subclass_label', 'joint_cluster_round4_annot']. As long as one of the column names is found in this list, the primary analysis will use that column to display the clusters to compare.

Don’t put a comma (“,”) or a space (“ ”) in any column name; otherwise, the multi-gene display function will have problems.

Prepare annotation

# Gene-by-cell expression table taken from the Seurat object.
t1.m <- data.frame(GetAssayData(object = SeuratObj))

# Fetch Ensembl ID / gene symbol pairs from the Ensembl BioMart (mouse here).
# Run listDatasets(mart) after useMart() to look up the dataset name for
# another species.
mart <- useMart("ensembl")
mart <- useDataset("mmusculus_gene_ensembl", mart = mart)
ensembl <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name"),
  mart = mart
)
names(ensembl) <- c("ensembl_ID", "gene_symbol")

# Keep a single Ensembl ID per gene symbol BEFORE merging. Merging the raw
# table would repeat the expression row of every symbol that has several IDs.
ensembl.dedup <- ensembl[!duplicated(ensembl$gene_symbol), ]

# Join on gene symbol (row names of t1.m); all.y keeps genes without a match.
t1.m.ann <- merge(
  ensembl.dedup, t1.m,
  by.x = "gene_symbol", by.y = 0, all.y = TRUE
)

# Genes without an Ensembl ID get placeholder IDs FAKE1, FAKE2, ... in row
# order.
missing_id <- is.na(t1.m.ann$ensembl_ID)
t1.m.ann$ensembl_ID[missing_id] <- paste0("FAKE", seq_len(sum(missing_id)))

# Gene sheet: ensembl_ID first, gene_symbol second.
genes <- t1.m.ann[, 2:1]

Create count matrix

# Expression sheet: everything except the leading gene_symbol column, so
# ensembl_ID is followed by one column per cell.
counts <- t1.m.ann[, -1]

# Optional, if R prefixed the cell names with "X":
# names(counts) <- gsub("X", "", names(counts))

# If R changed "-" to "." in the cell names, this turns every "." in the
# column names back into "-".
colnames(counts) <- gsub(".", "-", colnames(counts), fixed = TRUE)

# Write the three tab-separated sheets expected by the uploader.
write.table(
  obs,
  file = "observations.tab",
  sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
  genes,
  file = "genes.tab",
  sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
  counts,
  file = "expression.tab",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

Bundle the files into a single compressed archive for uploading

# Pack every .tab file in the working directory into upload.tar.gz.
system("tar -czvf upload.tar.gz *.tab")

C. Prepare the files from SingleCellExperiment object

library(Rtsne);library(SingleCellExperiment);library(CellTrails);library(scuttle)
# Load the SingleCellExperiment object (replace the path with your own file).
BP_Data <- readRDS("example.rds")
class(BP_Data)

# Normalise and take the log-normalised count matrix.
data <- scuttle::logNormCounts(BP_Data)

# NOTE(review): data.frame() may rewrite cell names that are not syntactic R
# names (e.g. "-" becomes "."), so the expression.tab header can differ from
# the "observations" column written below - confirm for your cell names.
exp <- data.frame(logcounts(data))

ann <- rowData(data)
ann$gene_symbol <- row.names(ann)

# Join the gene annotation to the expression values on their row names. The
# row-name key comes back as the first column and is renamed gene_symbol.
exp.ann <- merge(ann, exp, by.x = 0, by.y = 0)
names(exp.ann)[1] <- "gene_symbol"

# expression.tab: everything except the leading gene_symbol column.
write.table(exp.ann[-1], file = "expression.tab", sep = "\t",
            row.names = FALSE, quote = FALSE)

# genes.tab: column 2 followed by gene_symbol.
# NOTE(review): presumably column 2 is the gene ID held in rowData - confirm.
write.table(exp.ann[c(2, 1)], file = "genes.tab", sep = "\t",
            row.names = FALSE, quote = FALSE)

# Cell names as they appear in the expression table header (not used below).
raw.obs <- data.frame(names(exp))
names(raw.obs) <- "observations"

# Per-cell metadata, with the cell names as the first column.
obs <- data.frame(colData(BP_Data))
obs1 <- cbind(data.frame(row.names(obs)), obs)
names(obs1)[1] <- "observations"
# NOTE(review): assumes colData has a "CellTrails.state" column - confirm.
obs1$cell_type <- obs1$CellTrails.state

# List the available reductions; the tSNE below must be one of them.
reducedDimNames(BP_Data)

# First two tSNE coordinates per cell.
tsne.cor <- data.frame(reducedDim(BP_Data, "CellTrails.tSNE")[, 1:2])
colnames(tsne.cor) <- c("tSNE_1", "tSNE_2")

obs2 <- merge(obs1, tsne.cor, by.x = 0, by.y = 0)

# Drop only the "Row.names" key column added by merge(). This equals the
# former obs2[c(2,3,4,5,6,7)] when obs2 has 7 columns, and keeps cell_type and
# the tSNE columns for any other number of colData columns.
obs3 <- obs2[-1]
write.table(obs3, file = "observations.tab", sep = "\t",
            row.names = FALSE, quote = FALSE)

#### Compress the files together for uploading:
#### every .tab file in the working directory goes into upload.tar.gz.

system("tar -czvf upload.tar.gz *.tab")