In [1]:
import pandas as pd
import numpy as np
import os
import sys
import time
import scanpy as sc
import matplotlib.pyplot as plt
D:\Users\kndong\software\anaconda\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

1. Load scAND Module

In [2]:
sys.path.append('scAND-code/')
import scAND
In [3]:
np.random.seed(2019)

2. Load Data

The input of scAND contains three parts:

  1. A pandas.Series() represents the barcode of cells
  2. A pandas.Series() represents the region of peaks
  3. A pandas.DataFrame() contains 3 columns: Peaks, Cells, Counts (The number of Peaks and Cells start as 1). Each row of the DataFrame represents an element in scATAC-seq data.
In [4]:
Count_df = pd.read_csv('Example/Count_df.txt', sep='\t')
cells = pd.read_csv('Example/Count_Cells.txt', sep='\t', header=None)
cells = cells[0]
peaks = pd.read_csv('Example/Count_Peaks.txt', sep='\t', header=None)
peaks = peaks[0]
In [5]:
cells.head()
Out[5]:
0    AGCGATAGAAGAGGCACTAAGCCTCCTATCCT
1    AGCGATAGAAGAGGCACTAAGCCTGTACTGAC
2    AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT
3    AGCGATAGAAGAGGCATAGATCGCCAGGACGT
4    AGCGATAGAAGAGGCATATCCTCTATAGAGGC
Name: 0, dtype: object
In [6]:
peaks.head()
Out[6]:
0    chr1:740179-740374
1    chr1:800103-800829
2    chr1:800882-801440
3    chr1:825794-826048
4    chr1:850514-851177
Name: 0, dtype: object
In [7]:
Count_df.head()
Out[7]:
Peaks Cells Counts
0 1050 1 1
1 6456 1 2
2 7559 1 2
3 10435 1 2
4 10720 1 1

2.1 Construct Inputs from scATAC-seq Matrix

The get_scAND_inputs() function can be used to construct inputs from scATAC-seq matrix. Note that Each row and column of the used matrix should contain at least one element.

In [8]:
MM = pd.read_csv('Example/GSM1647122_GM12878vsHEK.dhsmatrix.txt.gz', sep='\t')
In [9]:
MM.iloc[:3,:10]
Out[9]:
chr start end annot AGCGATAGAAGAGGCACTAAGCCTCCTATCCT AGCGATAGAAGAGGCACTAAGCCTGTACTGAC AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT AGCGATAGAAGAGGCATAGATCGCCAGGACGT AGCGATAGAAGAGGCATATCCTCTATAGAGGC AGCGATAGAGGCAGAAAAGGAGTATATAGCCT
0 chr1 740179 740374 Gmspecific 0 0 0 0 0 0
1 chr1 800103 800829 Gmspecific 0 0 0 0 0 0
2 chr1 800882 801440 Gmspecific 0 0 0 0 0 0
In [10]:
scATAC = MM.iloc[:,4:]
scATAC.index = MM.iloc[:,:4].apply(lambda x: x['chr']+':'+str(x['start'])+'-'+str(x['end']), axis=1)
In [11]:
scATAC.iloc[:3, :5]
Out[11]:
AGCGATAGAAGAGGCACTAAGCCTCCTATCCT AGCGATAGAAGAGGCACTAAGCCTGTACTGAC AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT AGCGATAGAAGAGGCATAGATCGCCAGGACGT AGCGATAGAAGAGGCATATCCTCTATAGAGGC
chr1:740179-740374 0 0 0 0 0
chr1:800103-800829 0 0 0 0 0
chr1:800882-801440 0 0 0 0 0
In [12]:
Count_df, cells, peaks = scAND.get_scAND_inputs(scATAC)
In [13]:
cells.head()
Out[13]:
0    AGCGATAGAAGAGGCACTAAGCCTCCTATCCT
1    AGCGATAGAAGAGGCACTAAGCCTGTACTGAC
2    AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT
3    AGCGATAGAAGAGGCATAGATCGCCAGGACGT
4    AGCGATAGAAGAGGCATATCCTCTATAGAGGC
dtype: object
In [14]:
peaks.head()
Out[14]:
0    chr1:740179-740374
1    chr1:800103-800829
2    chr1:800882-801440
3    chr1:825794-826048
4    chr1:850514-851177
dtype: object
In [15]:
Count_df.head()
Out[15]:
Peaks Cells Counts
0 1 88 2
1 1 129 2
2 1 130 2
3 1 210 1
4 1 243 2

3. Run scAND

Run_scAND(Count_df, d, weights, cells, peaks, random_seed=2019, L2_norm=True, Binary=True, Graph_norm=True, return_peaks=False, verbose=True)

The parameters of Run_scAND() function:

  1. Count_df: A pandas.DataFrame() contains 3 columns: Peaks, Cells, Counts (The number of Peaks and Cells start as 1). Each row of the DataFrame represents an element in scATAC-seq data.
  2. d: The dimensions of the low-dimensional representation of scAND.
  3. weights: The parameter beta in scAND model. The input should be a list() or a np.array(). scAND can calculate results of different parameters simultaneously.
  4. cells: A pandas.Series() represents the barcode of cells
  5. peaks: A pandas.Series() represents the region of peaks
  6. random_seed: The random seed.
  7. L2_norm: Logical, should L2 regularization be applied to the scAND representation? Because L2 regularization is related to the dimensions used, we recommend not to do L2 regularization here, but to do L2 regularization in Get_Result() function.
  8. Binary: Logical, should binarization be applied to the scATAC-seq matrix?
  9. Graph_norm: Logical, should graph normalization be applied to the adjacency matrix of network?
  10. return_peaks: Logical, should the scAND representation of peaks be returned?
  11. verbose: Logical, should the function run silently?
In [16]:
# parameters
beta_list = [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

start_time = time.time()
Rep_cells = scAND.Run_scAND(Count_df=Count_df, d=50, weights=beta_list, cells=cells, peaks=peaks, random_seed=2019,
                            L2_norm=False, Binary=True, Graph_norm=True, return_peaks=False,verbose=True)
end_time = time.time()
print('Time Costing: %f s' %(end_time-start_time))
----------Info of Data:
Number of Cells:  748
Number of Peaks:  104260
----------Constructing Binary Graph...
Dim of Graph:  (105008, 105008)
----------Graph Normalization...
----------Calculating Eigendecomposition...
----------Eigen-decomposition reweighting...
Time Costing: 28.163546 s
In [17]:
used_dim = 10
used_beta = 0.8
used_rep = scAND.Get_Result(Rep_cells, beta=used_beta, dim=used_dim, L2_norm=True)
In [18]:
used_rep.iloc[:5, :5]
Out[18]:
PC1 PC2 PC3 PC4 PC5
AGCGATAGAAGAGGCACTAAGCCTCCTATCCT -0.912691 -0.388534 -0.070045 -0.072976 -0.031168
AGCGATAGAAGAGGCACTAAGCCTGTACTGAC -0.934024 -0.315168 -0.059602 -0.009940 -0.029809
AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT -0.901092 0.426732 0.055843 0.007612 0.006670
AGCGATAGAAGAGGCATAGATCGCCAGGACGT -0.880842 -0.452639 -0.080949 -0.002834 -0.082284
AGCGATAGAAGAGGCATATCCTCTATAGAGGC -0.845090 0.522160 0.074378 -0.004635 -0.033324

4. T-SNE Visualization or UMAP Visualization Using the scanpy Package

In [19]:
metadata = pd.read_csv('Example/metadata.txt', sep='\t', header=None, index_col=0)
metadata.columns = ['label']
metadata.head()
Out[19]:
label
0
AGCGATAGAAGAGGCACTAAGCCTCCTATCCT HEK293T
AGCGATAGAAGAGGCACTAAGCCTGTACTGAC HEK293T
AGCGATAGAAGAGGCAGTAAGGAGTATAGCCT GM12878
AGCGATAGAAGAGGCATAGATCGCCAGGACGT HEK293T
AGCGATAGAAGAGGCATATCCTCTATAGAGGC GM12878
In [20]:
adata = sc.AnnData(used_rep)
adata.var_names_make_unique()
adata.obs = metadata.copy()
In [21]:
sc.pp.neighbors(adata, random_state=2019)
sc.tl.tsne(adata, random_state=2019)
sc.tl.umap(adata, random_state=2019)

fig, ax = plt.subplots(figsize=(3, 3))
sc.pl.tsne(adata, color='label', title='scAND', s=15, ax=ax)

fig, ax = plt.subplots(figsize=(3, 3))
sc.pl.umap(adata, color='label', title='scAND', s=15, ax=ax)
WARNING: Consider installing the package MulticoreTSNE (https://github.com/DmitryUlyanov/Multicore-TSNE). Even for n_jobs=1 this speeds up the computation considerably and might yield better converged results.
... storing 'label' as categorical