import scanpy as sc
import random
random.seed(1)

import warnings
warnings.filterwarnings('ignore')

# Load the 10k PBMC AnnData object (per the file name, processed data that
# already carries UCE embeddings). The bare name on the last line displays
# the object summary when this runs as a notebook cell.
pb10k = sc.read_h5ad("../output/33_layer/10k_pbmcs_proc_uce_adata.h5ad")
pb10k

AnnData object with n_obs × n_vars = 11990 × 10809
    obs: 'n_counts', 'batch', 'labels', 'str_labels', 'cell_type', 'n_genes'
    var: 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'n_cells'
    uns: 'cell_types', 'hvg'
    obsm: 'X_uce', 'design', 'normalized_qc', 'qc_pc', 'raw_qc'


# Pull out X_uce from obsm and check its dimensions; the recorded cell
# output is (11990, 1280), i.e. one 1280-wide row per cell.
uce = pb10k.obsm["X_uce"]
uce.shape

(11990, 1280)


# Scatter the first two UCE coordinates against each other
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(uce[:, 0], uce[:, 1], s=1)
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.set_title("UCE Embedding")
plt.show()


import umap

# Run UMAP on the embedding.
# The random.seed(1) call at the top of the file only seeds Python's `random`
# module, which UMAP does not draw from, so the layout is pinned explicitly
# with random_state (the same seed the combined-dataset UMAP uses).
# Note: umap-learn runs single-threaded when random_state is set.
reducer = umap.UMAP(random_state=1)
embedding = reducer.fit_transform(uce)

# Plot the two UMAP coordinates
plt.scatter(embedding[:, 0], embedding[:, 1], s=1)
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.title("UMAP Embedding")
plt.show()


# Show metadata in the anndata object: the first rows of the per-cell `obs`
# table (displayed as the notebook cell's output)
pb10k.obs.head()


# Are the str_labels and cell_type metadata columns duplicates of each other?
# Compare them element-wise and collapse the result to a single boolean.
pb10k.obs["str_labels"].eq(pb10k.obs["cell_type"]).all()

np.True_


# Use scanpy to plot the UMAP.
# The coordinates computed above are stored under obsm['X_umap'], the key
# sc.pl.umap looks up, and the points are colored by the cell_type column.
pb10k.obsm['X_umap'] = embedding
sc.pl.umap(pb10k, color='cell_type')


# Load the PBMC 3k dataset (per the file name, raw data with UCE embeddings)
# and display its summary.
pb3k = sc.read_h5ad("../output/33_layer/pbmc3k_raw_uce_adata.h5ad")
pb3k

AnnData object with n_obs × n_vars = 2700 × 9714
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'
    obsm: 'X_uce'


# Read in the processed PBMC 3k object; it is used here only as the source
# of cell-type annotations.
pb3k_proc = sc.read_h5ad("../output/4_layer/pbmc3k_processed_proc.h5ad")

# The 'louvain' column holds the cell-type label for each barcode
# (a categorical Series with 8 categories, per the recorded output).
cell_type = pb3k_proc.obs["louvain"]
cell_type

index
AAACATACAACCAC-1        CD4 T cells
AAACATTGAGCTAC-1            B cells
AAACATTGATCAGC-1        CD4 T cells
AAACCGTGCTTCCG-1    CD14+ Monocytes
AAACCGTGTATGCG-1           NK cells
                         ...       
TTTCGAACTCTCAT-1    CD14+ Monocytes
TTTCTACTGAGGCA-1            B cells
TTTCTACTTCCTCG-1            B cells
TTTGCATGAGAGGC-1            B cells
TTTGCATGCCTCAC-1        CD4 T cells
Name: louvain, Length: 2638, dtype: category
Categories (8, object): ['CD4 T cells', 'CD14+ Monocytes', 'B cells', 'CD8 T cells', 'NK cells', 'FCGR3A+ Monocytes', 'Dendritic cells', 'Megakaryocytes']


# Add cell_type to pb3k.obs, given the index for each, but don't name index.
# The Series is aligned to pb3k.obs on the barcode index during assignment.
# NOTE(review): the annotation has 2638 entries while pb3k has 2700 cells, so
# barcodes absent from the processed object presumably end up as NaN here —
# confirm this is intended.
pb3k.obs["cell_type"] = cell_type
pb3k.obs


# Run umap on the PBMC 3k dataset.
# A dedicated, seeded reducer is used: refitting the shared `reducer` would
# discard its PBMC 10k fit, and an unseeded UMAP produces a different layout
# on every run.
pb3k_reducer = umap.UMAP(random_state=1)
pb3k_embedding = pb3k_reducer.fit_transform(pb3k.obsm["X_uce"])

# Add umap to the obsm of the pb3k dataset (the key sc.pl.umap looks up)
pb3k.obsm["X_umap"] = pb3k_embedding

# Use scanpy umap plot
sc.pl.umap(pb3k, color='cell_type', title='PBMC 3k UMAP colored by cell type')


# Combine the two datasets, making sure they have distinct names in the metadata
pb10k.obs["dataset"] = "10k"
pb3k.obs["dataset"] = "3k"

# Stack the cells of both objects into one AnnData. Per the recorded output
# the result has 8235 variables (fewer than either input, consistent with
# keeping only shared genes) and barcodes gain a -0 / -1 suffix.
# NOTE(review): AnnData.concatenate is deprecated in favor of anndata.concat
# in recent anndata releases — consider migrating and re-checking the output.
com = pb10k.concatenate(pb3k)
com

AnnData object with n_obs × n_vars = 14690 × 8235
    obs: 'n_counts', 'batch', 'labels', 'str_labels', 'cell_type', 'n_genes', 'dataset'
    var: 'gene_symbols-0', 'n_counts-0-0', 'n_counts-1-0', 'n_counts-0', 'highly_variable-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'n_cells-0', 'n_cells-1', 'gene_ids-1'
    obsm: 'X_uce', 'X_umap'


# Display the per-cell metadata table of the combined object
com.obs


# Make a umap from the combined data.
# The UCE embeddings of both datasets are fitted together so they share one
# 2-D layout.
com_uce = com.obsm["X_uce"]
com_reducer = umap.UMAP(random_state=1) # Good states: 1, 5, 6 (seeds noted by the author as giving good layouts)
com_embedding = com_reducer.fit_transform(com_uce)

# Overwrite the X_umap carried over from concatenation with the joint layout,
# then color the points by source dataset.
com.obsm["X_umap"] = com_embedding 
sc.pl.umap(com, color='dataset', title='Combined PBMC UMAP colored by dataset')


# Same combined UMAP layout, colored by the cell_type column instead
sc.pl.umap(com, color='cell_type', title='Combined UMAP colored by cell type')


# Save the combined anndata object (including the X_umap coordinates stored
# in obsm so far) to an .h5ad file
com.write_h5ad("../output/33_layer/combined_pbmc_uce_adata.h5ad")


# Make PCA of com uce space without scanpy
from sklearn.decomposition import PCA

# Seeded like the combined UMAP: for a matrix of this size scikit-learn's
# default solver selection may use randomized SVD, whose output otherwise
# varies between runs. fit_transform fits and projects in a single call.
pca = PCA(n_components=50, random_state=1)
com_pca = pca.fit_transform(com_uce)

# Plot the first two dimensions in scanpy (sc.pl.pca looks up obsm["X_pca"])
com.obsm["X_pca"] = com_pca
sc.pl.pca(com, color='dataset', title='Combined PCA colored by dataset')

# And plot by cell type
sc.pl.pca(com, color='cell_type', title='Combined PCA colored by cell type')

	n_counts	batch	labels	str_labels	cell_type	n_genes	dataset
AAACCTGAGCTAGTGG-1-0	4520.0	0	2.0	CD4 T cells	CD4 T cells	735	10k
AAACCTGCACATTAGC-1-0	2788.0	0	2.0	CD4 T cells	CD4 T cells	449	10k
AAACCTGCACTGTTAG-1-0	4667.0	0	1.0	CD14+ Monocytes	CD14+ Monocytes	942	10k
AAACCTGCATAGTAAG-1-0	4440.0	0	1.0	CD14+ Monocytes	CD14+ Monocytes	924	10k
AAACCTGCATGAACCT-1-0	3224.0	0	3.0	CD8 T cells	CD8 T cells	691	10k
...	...	...	...	...	...	...	...
TTTCGAACTCTCAT-1-1	NaN	1	NaN	NaN	CD14+ Monocytes	1148	3k
TTTCTACTGAGGCA-1-1	NaN	1	NaN	NaN	B cells	1215	3k
TTTCTACTTCCTCG-1-1	NaN	1	NaN	NaN	B cells	618	3k
TTTGCATGAGAGGC-1-1	NaN	1	NaN	NaN	B cells	449	3k
TTTGCATGCCTCAC-1-1	NaN	1	NaN	NaN	CD4 T cells	722	3k

Home ¶

The two PBMC datasets, each run through Universal Cell Embeddings, do not sit on top of each other in UMAP space

	n_counts	labels	str_labels	cell_type	n_genes
AAACCTGAGCTAGTGG-1	4520.0	2	CD4 T cells	CD4 T cells	735
AAACCTGCACATTAGC-1	2788.0	2	CD4 T cells	CD4 T cells	449
AAACCTGCACTGTTAG-1	4667.0	1	CD14+ Monocytes	CD14+ Monocytes	942
AAACCTGCATAGTAAG-1	4440.0	1	CD14+ Monocytes	CD14+ Monocytes	924
AAACCTGCATGAACCT-1	3224.0	3	CD8 T cells	CD8 T cells	691

	n_genes	cell_type
index
AAACATACAACCAC-1	778	CD4 T cells
AAACATTGAGCTAC-1	1346	B cells
AAACATTGATCAGC-1	1126	CD4 T cells
AAACCGTGCTTCCG-1	953	CD14+ Monocytes
AAACCGTGTATGCG-1	520	NK cells
...	...	...
TTTCGAACTCTCAT-1	1148	CD14+ Monocytes
TTTCTACTGAGGCA-1	1215	B cells
TTTCTACTTCCTCG-1	618	B cells
TTTGCATGAGAGGC-1	449	B cells
TTTGCATGCCTCAC-1	722	CD4 T cells

Home¶

The two PBMC datasets, each run through Universal Cell Embeddings, do not sit on top of each other in UMAP space

Home ¶