import pandas as pd
import numpy as np
import os
import sys
import time
import scanpy as sc
import matplotlib.pyplot as plt

D:\Users\kndong\software\anaconda\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

1. Load scAND Module¶

sys.path.append('scAND-code/')
import scAND

np.random.seed(2019)

2. Load Data¶

The input of scAND contains three parts:

A pandas.Series() represents the barcode of cells
A pandas.Series() represents the region of peaks
A pandas.DataFrame() contains 3 columns: Peaks, Cells, Counts (The number of Peaks and Cells start as 1). Each row of the DataFrame represents an element in scATAC-seq data.

Count_df = pd.read_csv('Example/Count_df.txt', sep='\t')
cells = pd.read_csv('Example/Count_Cells.txt', sep='\t', header=None)
cells = cells[0]
peaks = pd.read_csv('Example/Count_Peaks.txt', sep='\t', header=None)
peaks = peaks[0]

cells.head()

0    AGCGATAGAAGAGGCACTAAGCCTCCTATCCT
1    AGCGATAGAAGAGGCACTAAGCCTGTACTGAC
2    AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT
3    AGCGATAGAAGAGGCATAGATCGCCAGGACGT
4    AGCGATAGAAGAGGCATATCCTCTATAGAGGC
Name: 0, dtype: object

peaks.head()

0    chr1:740179-740374
1    chr1:800103-800829
2    chr1:800882-801440
3    chr1:825794-826048
4    chr1:850514-851177
Name: 0, dtype: object

Count_df.head()

2.1 Construct Inputs from scATAC-seq Matrix¶

The get_scAND_inputs() function can be used to construct inputs from scATAC-seq matrix. Note that Each row and column of the used matrix should contain at least one element.

MM = pd.read_csv('Example/GSM1647122_GM12878vsHEK.dhsmatrix.txt.gz', sep='\t')

MM.iloc[:3,:10]

scATAC = MM.iloc[:,4:]
scATAC.index = MM.iloc[:,:4].apply(lambda x: x['chr']+':'+str(x['start'])+'-'+str(x['end']), axis=1)

scATAC.iloc[:3, :5]

Count_df, cells, peaks = scAND.get_scAND_inputs(scATAC)

cells.head()

0    AGCGATAGAAGAGGCACTAAGCCTCCTATCCT
1    AGCGATAGAAGAGGCACTAAGCCTGTACTGAC
2    AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT
3    AGCGATAGAAGAGGCATAGATCGCCAGGACGT
4    AGCGATAGAAGAGGCATATCCTCTATAGAGGC
dtype: object

peaks.head()

0    chr1:740179-740374
1    chr1:800103-800829
2    chr1:800882-801440
3    chr1:825794-826048
4    chr1:850514-851177
dtype: object

Count_df.head()

3. Run scAND¶

Run_scAND(Count_df, d, weights, cells, peaks, random_seed=2019, L2_norm=True, Binary=True, Graph_norm=True, return_peaks=False, verbose=True)

The parameters of Run_scAND() function:

Count_df: A pandas.DataFrame() contains 3 columns: Peaks, Cells, Counts (The number of Peaks and Cells start as 1). Each row of the DataFrame represents an element in scATAC-seq data.
d: The dimensions of the low-dimensional representation of scAND.
weights: The parameter beta in scAND model. The input should be a list() or a np.array(). scAND can calculate results of different parameters simultaneously.
cells: A pandas.Series() represents the barcode of cells
peaks: A pandas.Series() represents the region of peaks
random_seed: The random seed.
L2_norm: Logical, should L2 regularization be applied to the scAND representation? Because L2 regularization is related to the dimensions used, we recommend not to do L2 regularization here, but to do L2 regularization in Get_Result() function.
Binary: Logical, should binarization be applied to the scATAC-seq matrix?
Graph_norm: Logical, should graph normalization be applied to the adjacency matrix of network?
return_peaks: Logical, should the scAND representation of peaks be returned?
verbose: Logical, should the function run silently?

# parameters
beta_list = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

start_time = time.time()
Rep_cells = scAND.Run_scAND(Count_df=Count_df, d=50, weights=beta_list, cells=cells, peaks=peaks, random_seed=2019,
                            L2_norm=False, Binary=True, Graph_norm=True, return_peaks=False,verbose=True)
end_time = time.time()
print('Time Costing: %f s' %(end_time-start_time))

----------Info of Data:
Number of Cells:  748
Number of Peaks:  104260
----------Constructing Binary Graph...
Dim of Graph:  (105008, 105008)
----------Graph Normalization...
----------Calculating Eigendecomposition...
----------Eigen-decomposition reweighting...
Time Costing: 28.163546 s

used_dim = 10
used_beta = 0.8
used_rep = scAND.Get_Result(Rep_cells, beta=used_beta, dim=used_dim, L2_norm=True)

used_rep.iloc[:5, :5]

4. T-SNE Visualization or UMAP Visualization Using the scanpy Package¶

metadata = pd.read_csv('Example/metadata.txt', sep='\t', header=None, index_col=0)
metadata.columns = ['label']
metadata.head()

adata = sc.AnnData(used_rep)
adata.var_names_make_unique()
adata.obs = metadata.copy()

sc.pp.neighbors(adata, random_state=2019)
sc.tl.tsne(adata, random_state=2019)
sc.tl.umap(adata, random_state=2019)

fig, ax = plt.subplots(figsize=(3, 3))
sc.pl.tsne(adata, color='label', title='scAND', s=15, ax=ax)

fig, ax = plt.subplots(figsize=(3, 3))
sc.pl.umap(adata, color='label', title='scAND', s=15, ax=ax)

WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.

... storing 'label' as categorical

	PC1	PC2	PC3	PC4	PC5

AGCGATAGAAGAGGCACTAAGCCTCCTATCCT	-0.912691	-0.388534	-0.070045	-0.072976	-0.031168
AGCGATAGAAGAGGCACTAAGCCTGTACTGAC	-0.934024	-0.315168	-0.059602	-0.009940	-0.029809
AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT	-0.901092	0.426732	0.055843	0.007612	0.006670
AGCGATAGAAGAGGCATAGATCGCCAGGACGT	-0.880842	-0.452639	-0.080949	-0.002834	-0.082284
AGCGATAGAAGAGGCATATCCTCTATAGAGGC	-0.845090	0.522160	0.074378	-0.004635	-0.033324

	chr	start	end	annot
0	chr1	740179	740374	Gmspecific
1	chr1	800103	800829	Gmspecific
2	chr1	800882	801440	Gmspecific

	AGCGATAGAAGAGGCACTAAGCCTCCTATCCT	AGCGATAGAAGAGGCACTAAGCCTGTACTGAC	AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT	AGCGATAGAAGAGGCATAGATCGCCAGGACGT	AGCGATAGAAGAGGCATATCCTCTATAGAGGC
chr1:740179-740374	0	0	0	0	0
chr1:800103-800829	0	0	0	0	0
chr1:800882-801440	0	0	0	0	0

	label
0
AGCGATAGAAGAGGCACTAAGCCTCCTATCCT	HEK293T
AGCGATAGAAGAGGCACTAAGCCTGTACTGAC	HEK293T
AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT	GM12878
AGCGATAGAAGAGGCATAGATCGCCAGGACGT	HEK293T
AGCGATAGAAGAGGCATATCCTCTATAGAGGC	GM12878

	Peaks	Cells	Counts
0	1050	1	1
1	6456	1	2
2	7559	1	2
3	10435	1	2
4	10720	1	1

	Peaks	Cells	Counts
0	1	88	2
1	1	129	2
2	1	130	2
3	1	210	1
4	1	243	2