In Seurat or Scanpy, a standard scRNA-seq analysis uses only the top ~2,000 most variable genes (selected by a combination of mean expression and dispersion).
Feeding only these genes into large foundation models, rather than the whole dataset, could speed up the process...if the model can produce similar results from a dataset that consists only of the top variable genes.
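For reference, here is a minimal sketch of that selection step in Scanpy; the file name is a placeholder and the exact flavor and cutoffs vary between workflows:
import scanpy as sc
# Standard preprocessing before picking highly variable genes
adata = sc.read_h5ad("pbmc3k_raw.h5ad")  # placeholder path
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep roughly the top 2000 genes ranked by mean expression and dispersion (Seurat-style)
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)
adata = adata[:, adata.var["highly_variable"]]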
Which brings us to the topic of this notebook.
The following is an experiment in which I took the most variable genes from the PBMC 3k dataset, as I would in a standard analysis, and ran them through the Universal Cell Embeddings (UCE) foundation model (paper here), which expects the full dataset. The question was whether it would (aside from actually working at all) produce comparable output.
The UCE model takes each cell's raw RNA counts and converts them into a 1,280-dimensional vector that carries biological meaning when compared to the vectors of other cells. The UMAPs produced previously suggest that the biological structure found by UCE is comparable to that of the standard analysis pipeline. For more on this, see my previous markdown here.
Specifically, this experiment has three groups:

1. A dataset containing only the most variable genes (experimental).
2. The full dataset (positive control).
3. A dataset containing randomly selected genes rather than the most variable genes (negative control).

Note that although I selected just under 2,000 genes for groups 1 and 3 (in line with standard practice), a processing step in the UCE workflow filtered out additional genes. This does not affect the point of the experiment.
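For illustration, here is a minimal sketch of how the variable-gene and random-gene inputs could be built before running UCE; the selection parameters are assumptions, not a record of the actual preprocessing:
import scanpy as sc
import numpy as np
raw = sc.read_h5ad("../data/pbmc3k_raw_uce_adata.h5ad")  # raw counts, as UCE expects
# Experimental group: pick variable genes on a normalized/log copy,
# then subset the raw counts so UCE still sees raw values
norm = raw.copy()
sc.pp.normalize_total(norm, target_sum=1e4)
sc.pp.log1p(norm)
sc.pp.highly_variable_genes(norm, flavor="seurat", n_top_genes=2000)
cut = raw[:, norm.var["highly_variable"]].copy()
# Negative control: the same number of randomly chosen genes
rng = np.random.default_rng(0)
rand = raw[:, rng.choice(raw.var_names, size=2000, replace=False)].copy()
cut.write_h5ad("../output/pbmc3k_raw_filtered_uce_adata.h5ad")
rand.write_h5ad("../output/pbmc3k_random_filter_uce_adata.h5ad")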
First, we import the data and check it. The processed dataset, loaded after the three experimental groups, carries the cell type annotations.
import scanpy as sc
import warnings
warnings.filterwarnings('ignore')
full = sc.read_h5ad("../data/pbmc3k_raw_uce_adata.h5ad")
cut = sc.read_h5ad("../output/pbmc3k_raw_filtered_uce_adata.h5ad")
random = sc.read_h5ad("../output/pbmc3k_random_filter_uce_adata.h5ad")
proc = sc.read_h5ad("../data/pbmc3k_processed.h5ad")
print(full) # Positive control
print(cut) # Experimental
print(random) # Negative control
print(proc) # Where we get the cell type annotations
AnnData object with n_obs × n_vars = 2700 × 9714
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'
    obsm: 'X_uce'
AnnData object with n_obs × n_vars = 2700 × 1509
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'
    obsm: 'X_uce'
AnnData object with n_obs × n_vars = 2392 × 528
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'
    obsm: 'X_uce'
AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
    var: 'n_cells'
    uns: 'draw_graph', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'X_draw_graph_fr'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
# Note the total number of genes used
print(full.shape)
print(cut.shape)
print(random.shape)
print(proc.shape)
(2700, 9714)
(2700, 1509)
(2392, 528)
(2638, 1838)
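As a quick sanity check on the embeddings themselves (assuming UCE produced the 1,280-dimensional vectors described above):
# Each dataset stores one UCE embedding row per cell in obsm["X_uce"]
print(full.obsm["X_uce"].shape)  # expected to be (2700, 1280)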
# Make dataset names in the metadata for each
full.obs["dataset"] = "full"
cut.obs["dataset"] = "cut"
random.obs["dataset"] = "random"
# This works because pandas aligns on the cell barcodes (the obs index)
full.obs["louvain"] = proc.obs["louvain"]
cut.obs["louvain"] = proc.obs["louvain"]
random.obs["louvain"] = proc.obs["louvain"]
# But given that proc is smaller than full and cut, we get a few NaNs
# So we remove them here
full = full[full.obs["louvain"].notnull()]
cut = cut[cut.obs["louvain"].notnull()]
random = random[random.obs["louvain"].notnull()]
Now that we have read in and pre-processed the data (added the cell type annotations to each dataset), we will concatenate the three datasets into one.
# Concatenate the three datasets
adata = full.concatenate(cut, random)
adata.obs
| index | n_genes | dataset | louvain | batch |
|---|---|---|---|---|
| AAACATACAACCAC-1-0 | 778 | full | CD4 T cells | 0 |
| AAACATTGAGCTAC-1-0 | 1346 | full | B cells | 0 |
| AAACATTGATCAGC-1-0 | 1126 | full | CD4 T cells | 0 |
| AAACCGTGCTTCCG-1-0 | 953 | full | CD14+ Monocytes | 0 |
| AAACCGTGTATGCG-1-0 | 520 | full | NK cells | 0 |
| ... | ... | ... | ... | ... |
| TTTCGAACACCTGA-1-2 | 64 | random | Dendritic cells | 2 |
| TTTCGAACTCTCAT-1-2 | 52 | random | CD14+ Monocytes | 2 |
| TTTCTACTGAGGCA-1-2 | 58 | random | B cells | 2 |
| TTTCTACTTCCTCG-1-2 | 29 | random | B cells | 2 |
| TTTGCATGCCTCAC-1-2 | 33 | random | CD4 T cells | 2 |

7644 rows × 4 columns
Now we visualize the data via UMAP. In the previous experiment here, we observed that UMAP distorts distances between islands, at least for UCE. But here the purpose is simply to look at cell type and island separation within each dataset, not between them, and for that purpose UMAP is fine.
import umap.umap_ as umap
# Make a UMAP of the concatenated datasets, colored by dataset
# Make the umap outside of scanpy
reducer = umap.UMAP(random_state=1)
embedding = reducer.fit_transform(adata.obsm["X_uce"])
adata.obsm["X_umap"] = embedding
# Plot the umap, which is in the obsm["X_umap"] slot
# using sc.pl.umap
sc.pl.umap(adata, color="dataset")
sc.pl.umap(adata, color="louvain")
We observe that the full dataset has the best separation. The random dataset is packed into a single island, though it still shows some evidence of cell type separation within it. The variable-genes dataset shows clear island separation, though the islands sit closer together, suggesting that somewhat less separation is detected.
But given my general skepticism around UMAP, we will also look at PCA space, which proved insightful in my previous notebook on this topic.
# Run pca from X_uce, outside of scanpy
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(adata.obsm["X_uce"])
pca_embedding = pca.transform(adata.obsm["X_uce"])
# Put the pca embedding into the obsm slot
adata.obsm["X_pca"] = pca_embedding
# Plot the x_PCA slot in obsm colored by dataset
sc.pl.pca(adata, color="dataset")
sc.pl.pca(adata, color="louvain")
In PCA space, we again see the random dataset bunched into a single island. The full dataset has the best separation, though the CD8 T cells and the B cells appear somewhat bunched together.
In the variable-genes dataset, however, there is no separation at all between the CD4 T cells, CD8 T cells, and B cells, though that combined island is separated from the monocytes.
Now we will quantify these results by looking at cell type separation within each dataset, in terms of the heterogeneity of each cell's k-nearest neighborhood, using Shannon entropy as the metric.
# Run KNN in UCE space of the full, cut, and random datasets
from sklearn.neighbors import NearestNeighbors
import numpy as np
# Find the k-nearest neighbors of each cell in X_uce, within each individual dataset
def uce_knn_indices(adata, n_neighbors=10):
    # Fit on the UCE embedding and return neighbor indices (each cell's own index is excluded)
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(adata.obsm["X_uce"])
    return knn.kneighbors(return_distance=False)
full_knn = uce_knn_indices(full)
cut_knn = uce_knn_indices(cut)
random_knn = uce_knn_indices(random)
# For each KNN, find the cell types of the neighbors (obs["louvain"])
full_knn_subsets = [full.obs["louvain"].iloc[i].tolist() for i in full_knn]
cut_knn_subsets = [cut.obs["louvain"].iloc[i].tolist() for i in cut_knn]
random_knn_subsets = [random.obs["louvain"].iloc[i].tolist() for i in random_knn]
# What this looks like
random_knn_subsets[:2]
[['CD4 T cells', 'CD4 T cells', 'CD4 T cells', 'CD4 T cells', 'CD8 T cells', 'CD4 T cells', 'CD4 T cells', 'CD14+ Monocytes', 'CD4 T cells', 'CD4 T cells'], ['B cells', 'CD4 T cells', 'CD4 T cells', 'B cells', 'CD4 T cells', 'FCGR3A+ Monocytes', 'CD4 T cells', 'CD4 T cells', 'B cells', 'CD4 T cells']]
Now we will take the subsets above and calculate the Shannon entropy of each. Think of this as the average amount of surprisal. A fair coin has higher entropy than a biased coin, because a fair coin is less predictable. So the more evenly mixed the cell types in a neighborhood, the higher the entropy; if there is only one cell type in a neighborhood, the entropy is zero.
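As a quick illustration of the coin analogy (not part of the analysis itself):
from scipy.stats import entropy
# A fair coin is less predictable than a biased one, so its entropy is higher
print(entropy([0.5, 0.5]))  # ~0.693, the maximum for two outcomes (in nats)
print(entropy([0.9, 0.1]))  # ~0.325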
# Calculate the Shannon entropy of the KNN cell types
from scipy.stats import entropy
import numpy as np
# Assuming full_knn_subsets, cut_knn_subsets, and random_knn_subsets are defined
full_knn_entropy = [entropy(np.unique(i, return_counts=True)[1] / len(i)) for i in full_knn_subsets]
cut_knn_entropy = [entropy(np.unique(i, return_counts=True)[1] / len(i)) for i in cut_knn_subsets]
random_knn_entropy = [entropy(np.unique(i, return_counts=True)[1] / len(i)) for i in random_knn_subsets]
# Compare with neighborhood composition in the previous code block
print(random_knn_entropy[:2])
# Example with min (0) entropy: all values are the same
values_min = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
probs_min = np.bincount(values_min)[1:] / len(values_min) # Probability distribution
print(probs_min)
print(entropy(probs_min)) # Entropy should be 0
# Example with max entropy: all values are equally likely
values_max = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
probs_max = np.bincount(values_max)[1:] / len(values_max) # Probability distribution
print(probs_max)
print(entropy(probs_max)) # Higher entropy
[0.639031859650177, 0.8979457248567798]
[1.]
0.0
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
2.3025850929940455
import numpy as np
import matplotlib.pyplot as plt
# Assuming full_knn_entropy, cut_knn_entropy, random_knn_entropy are defined
# Calculate means
means = [np.mean(full_knn_entropy), np.mean(cut_knn_entropy), np.mean(random_knn_entropy)]
# Calculate standard deviations for error bars
errors = [np.std(full_knn_entropy), np.std(cut_knn_entropy), np.std(random_knn_entropy)]
# Create the bar plot with error bars
plt.bar(["full", "cut", "random"], means, yerr=errors, capsize=5, color=['blue', 'green', 'orange'])
# Add labels and title
plt.ylabel('Mean Entropy')
plt.title('Mean Entropy of KNN Subsets with Error Bars')
# Show plot
plt.show()
import matplotlib.pyplot as plt
# Plot the entropy distributions as histograms with edge colors and improved settings
plt.hist(full_knn_entropy, bins=20, alpha=0.5, label="full", edgecolor='black')
plt.hist(cut_knn_entropy, bins=20, alpha=0.5, label="cut", edgecolor='blue')
plt.hist(random_knn_entropy, bins=20, alpha=0.5, label="random", edgecolor='green')
# Set labels and title for clarity
plt.xlabel("Entropy")
plt.ylabel("Frequency")
plt.title("KNN Entropy Distributions")
# Add a legend so the three distributions can be distinguished
plt.legend(loc='upper right')
# Display the plot
plt.show()
These results give us a bit more intuition about the UCE foundation model. It appears to draw meaningful information from the entire gene set (or at least to use it when generating meaningful output), since restricting the input to variable genes degrades cell type separation.
This means that for UCE, users should indeed feed in the whole dataset, not just the variable genes.
Whether this carries over to other foundation models remains to be seen.