{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyze Melanoma Patient 78\n", "\n", "Here we analyze the gene expression of cells from melanoma patient 78.\n", "For simplicity, we have converted the dataset to TPM (transcripts per million).\n", "The original count data is available from the Gene Expression Omnibus: [GSE72056](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import necessary packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 1" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import pickle as pkl\n", "import sklearn as skl\n", "import sklearn.preprocessing\n", "import scipy.stats\n", "\n", "import matplotlib as mpl\n", "\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Importing TensorFlow may print warnings (such as the `FutureWarning` shown below). They are harmless and can be ignored." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/shaoheng/.conda/envs/tensorflow-gpu/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", " from ._conv import register_converters as _register_converters\n" ] } ], "source": [ "import tensorflow as tf\n", "tf.set_random_seed(1)\n", "import cyclum\n", "from cyclum import writer" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "input_file_mask = 'data/melanoma/M78_tumor'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read data\n", "This dataset does not come with cell-cycle labels for the cells." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def preprocess(input_file_mask):\n", " \"\"\"\n", " Read the expression matrix and the per-cell label table for one dataset.\n", "\n", " No transformation is applied here: the matrix is returned as loaded by `writer.read_df_from_binary`.\n", " NOTE(review): the stored values appear to be already log-transformed and standardized (presumably log2(x+1), mean = 0, sd = 1) -- confirm against how the file was produced.\n", "\n", " input_file_mask: path prefix of the dataset; the labels are read from the tab-separated file `input_file_mask + '-label.csv'`, with its first column used as the index.\n", " Returns the tuple `(sttpm, label)`.\n", " \"\"\"\n", " sttpm = writer.read_df_from_binary(input_file_mask)\n", "\n", " label = pd.read_csv(input_file_mask + '-label.csv', sep=\"\\t\", index_col=0)\n", " return sttpm, label\n", "\n", "sttpm, label = preprocess(input_file_mask)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is no universal convention on whether cells should be rows or columns. Here we require cells to be rows." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | C9orf152 | \n", "RPS11 | \n", "ELMO2 | \n", "CREB3L1 | \n", "PNMA1 | \n", "MMP2 | \n", "TMEM216 | \n", "TRAF3IP2-AS1 | \n", "LRRC37A5P | \n", "LOC653712 | \n", "... | \n", "GPLD1 | \n", "SNORD115-39 | \n", "RAB8A | \n", "RXFP2 | \n", "PCIF1 | \n", "PIK3IP1 | \n", "SNRPD2 | \n", "SLC39A6 | \n", "CTSC | \n", "AQP7 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cy78-CD45-neg-1-B04-S496-comb | \n", "-0.101847 | \n", "0.007892 | \n", "0.904987 | \n", "0.0 | \n", "0.940964 | \n", "0.362364 | \n", "1.468877 | \n", "-0.794159 | \n", "0.0 | \n", "-0.137973 | \n", "... | \n", "-0.443219 | \n", "0.0 | \n", "-0.001803 | \n", "-0.087706 | \n", "-1.099878 | \n", "0.124245 | \n", "0.069167 | \n", "0.333042 | \n", "-0.390829 | \n", "-0.133081 | \n", "
cy78-CD45-neg-3-H06-S762-comb | \n", "-0.101847 | \n", "-0.202088 | \n", "0.875485 | \n", "0.0 | \n", "1.228029 | \n", "2.059569 | \n", "1.475918 | \n", "-0.433542 | \n", "0.0 | \n", "-0.137973 | \n", "... | \n", "0.074614 | \n", "0.0 | \n", "0.658312 | \n", "-0.087706 | \n", "0.795460 | \n", "0.865745 | \n", "0.430293 | \n", "0.688241 | \n", "-0.973564 | \n", "-0.133081 | \n", "
cy78-CD45-neg-1-D07-S523-comb | \n", "-0.101847 | \n", "0.048798 | \n", "-0.044918 | \n", "0.0 | \n", "0.092909 | \n", "-0.693664 | \n", "1.148125 | \n", "-0.361802 | \n", "0.0 | \n", "-0.137973 | \n", "... | \n", "-0.939599 | \n", "0.0 | \n", "0.951421 | \n", "-0.087706 | \n", "0.263022 | \n", "-0.508476 | \n", "0.559235 | \n", "0.116920 | \n", "0.332409 | \n", "-0.133081 | \n", "
cy78-CD45-neg-3-D01-S709-comb | \n", "-0.101847 | \n", "0.025618 | \n", "-1.046433 | \n", "0.0 | \n", "0.457558 | \n", "-0.693664 | \n", "1.070903 | \n", "-0.206473 | \n", "0.0 | \n", "-0.137973 | \n", "... | \n", "-0.432214 | \n", "0.0 | \n", "1.198078 | \n", "-0.087706 | \n", "1.066057 | \n", "-0.508476 | \n", "0.920172 | \n", "0.153443 | \n", "-1.072105 | \n", "-0.133081 | \n", "
cy78-CD45-neg-2-B08-S596-comb | \n", "-0.101847 | \n", "0.143624 | \n", "1.412362 | \n", "0.0 | \n", "0.253140 | \n", "-0.693664 | \n", "-0.583134 | \n", "-1.203133 | \n", "0.0 | \n", "-0.137973 | \n", "... | \n", "1.698251 | \n", "0.0 | \n", "0.974617 | \n", "-0.087706 | \n", "-0.521885 | \n", "0.187822 | \n", "-0.166806 | \n", "0.687519 | \n", "-0.361164 | \n", "-0.133081 | \n", "
5 rows × 23686 columns
\n", "\n", " | tumor | \n", "malignant(1=no,2=yes,0=unresolved) | \n", "non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK) | \n", "
---|---|---|---|
cy78-CD45-neg-1-B04-S496-comb | \n", "78 | \n", "2 | \n", "0 | \n", "
cy78-CD45-neg-3-H06-S762-comb | \n", "78 | \n", "2 | \n", "0 | \n", "
cy78-CD45-neg-1-D07-S523-comb | \n", "78 | \n", "2 | \n", "0 | \n", "
cy78-CD45-neg-3-D01-S709-comb | \n", "78 | \n", "2 | \n", "0 | \n", "
cy78-CD45-neg-2-B08-S596-comb | \n", "78 | \n", "2 | \n", "0 | \n", "