{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Overview\n", "\n", "In this notebook, we will annotate the transcription start sites (TSSs) in bulk ATAC-seq data to get the active promoter information for base GRN construction.\n", "\n", "### Notebook file\n", "Notebook file is available here.\n", "https://github.com/morris-lab/CellOracle/blob/master/docs/notebooks/01_ATAC-seq_data_processing/option2_Bulk_ATAC-seq_data/01_preprocess_Bulk_ATAC_seq_peak_data.ipynb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 0. Import libraries" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "import seaborn as sns\n", "\n", "import os, sys, shutil, importlib, glob\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "%config InlineBackend.figure_format = 'retina'\n", "\n", "plt.rcParams['figure.figsize'] = [6, 4.5]\n", "plt.rcParams[\"savefig.dpi\"] = 300\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Import celloracle function\n", "from celloracle import motif_analysis as ma" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load input data\n", "\n", "Import ATAC-seq bed file.\n", "This script can also be used with DNase-seq or ChIP-seq data." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.0. 
Download data\n", "\n", "\n", "Here, we use bulk ATAC-seq data.\n", "Please prepare the bulk ATAC-seq data in bed file format.\n", "\n", "You can download the demo file by running the following command:\n", "\n", "Note: If the file download fails, please manually download the data from the URL below.\n", "\n", "https://raw.githubusercontent.com/morris-lab/CellOracle/master/docs/demo_data/bulk_ATAC_seq_peak_data.bed\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-07-07 21:38:59-- https://raw.githubusercontent.com/morris-lab/CellOracle/master/docs/demo_data/bulk_ATAC_seq_peak_data.bed\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 10446347 (10.0M) [text/plain]\n", "Saving to: ‘bulk_ATAC_seq_peak_data.bed’\n", "\n", "bulk_ATAC_seq_peak_ 100%[===================>] 9.96M --.-KB/s in 0.1s \n", "\n", "2021-07-07 21:39:00 (80.3 MB/s) - ‘bulk_ATAC_seq_peak_data.bed’ saved [10446347/10446347]\n", "\n" ] } ], "source": [ "\n", "# Download file. \n", "!wget https://raw.githubusercontent.com/morris-lab/CellOracle/master/docs/demo_data/bulk_ATAC_seq_peak_data.bed\n", " \n", "# If you are using macOS, please try the following command.\n", "#!curl -O https://raw.githubusercontent.com/morris-lab/CellOracle/master/docs/demo_data/bulk_ATAC_seq_peak_data.bed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.1. Load bed file" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(436206, 4)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromstartendseqname
0chr130024783002968chr1_3002478_3002968
1chr130847393085712chr1_3084739_3085712
2chr131035763104022chr1_3103576_3104022
3chr131068713107210chr1_3106871_3107210
4chr131089323109158chr1_3108932_3109158
\n", "
" ], "text/plain": [ " chrom start end seqname\n", "0 chr1 3002478 3002968 chr1_3002478_3002968\n", "1 chr1 3084739 3085712 chr1_3084739_3085712\n", "2 chr1 3103576 3104022 chr1_3103576_3104022\n", "3 chr1 3106871 3107210 chr1_3106871_3107210\n", "4 chr1 3108932 3109158 chr1_3108932_3109158" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load bed_file\n", "file_path_of_bed_file = \"bulk_ATAC_seq_peak_data.bed\" \n", "bed = ma.read_bed(file_path_of_bed_file)\n", "print(bed.shape)\n", "bed.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['chr1_3002478_3002968', 'chr1_3084739_3085712',\n", " 'chr1_3103576_3104022', ..., 'chrY_631222_631480',\n", " 'chrY_795887_796426', 'chrY_2397419_2397628'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert bed file into peak name list\n", "peaks = ma.process_bed_file.df_to_list_peakstr(bed)\n", "peaks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Make TSS annotation\n", "IMPORTANT: Please make sure that you are setting the correct reference genome!" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "que bed peaks: 436206\n", "tss peaks in que: 24822\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chrstartendgene_short_namestrand
24817chr26056021160561602Itgb6-
24818chr1539751773978654BC037032-
24819chr146769070167692101Ppp2r2a-
24820chr174845524748455773B430306N03Rik+
24821chr105986119259861608Gm17455+
\n", "
" ], "text/plain": [ " chr start end gene_short_name strand\n", "24817 chr2 60560211 60561602 Itgb6 -\n", "24818 chr15 3975177 3978654 BC037032 -\n", "24819 chr14 67690701 67692101 Ppp2r2a -\n", "24820 chr17 48455247 48455773 B430306N03Rik +\n", "24821 chr10 59861192 59861608 Gm17455 +" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"mm9\")\n", "\n", "# Check results\n", "tss_annotated.tail()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(24822, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
peak_idgene_short_name
0chr7_50691730_50692032Nkg7
1chr7_50692077_50692785Nkg7
2chr13_93564413_93564836Thbs4
3chr13_14613429_14615645Hecw1
4chr3_99688753_99689665Spag17
\n", "
" ], "text/plain": [ " peak_id gene_short_name\n", "0 chr7_50691730_50692032 Nkg7\n", "1 chr7_50692077_50692785 Nkg7\n", "2 chr13_93564413_93564836 Thbs4\n", "3 chr13_14613429_14615645 Hecw1\n", "4 chr3_99688753_99689665 Spag17" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Change format\n", "peak_id_tss = ma.process_bed_file.df_to_list_peakstr(tss_annotated)\n", "tss_annotated = pd.DataFrame({\"peak_id\": peak_id_tss,\n", " \"gene_short_name\": tss_annotated.gene_short_name.values})\n", "tss_annotated = tss_annotated.reset_index(drop=True)\n", "print(tss_annotated.shape)\n", "tss_annotated.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Save data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "tss_annotated.to_csv(\"processed_peak_file.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please go to the next step: Transcription factor motif scan**\n", "\n", "https://morris-lab.github.io/CellOracle.documentation/tutorials/motifscan.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "finalized": { "timestamp": 1642784326953, "trusted": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 2 }