Top

parse module

survivalvolume/plot.py

Functions and classes for plotting tumour volume vs time and survival endpoints based on volume thresholds

Created by Matthew Wakefield. Copyright (c) 2016 Matthew Wakefield, The Walter and Eliza Hall Institute and The University of Melbourne. All rights reserved.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

#!/usr/bin/env python3
# encoding: utf-8
"""
survivalvolume/plot.py

Functions and classes for plotting tumour volume vs time and survival endpoints based on volume thresholds

Created by Matthew Wakefield.
Copyright (c) 2016  Matthew Wakefield, The Walter and Eliza Hall Institute and The University of Melbourne. All rights reserved.

   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


"""

import pandas

__author__ = "Matthew Wakefield"
__copyright__ = "Copyright 2016 Matthew Wakefield, The Walter and Eliza Hall Institute and The University of Melbourne"
__credits__ = ["Matthew Wakefield",]
__license__ = "GPL"
__version__ = "1.2.0"
__maintainer__ = "Matthew Wakefield"
__email__ = "wakefield@wehi.edu.au"
__status__ = "production"

def split_on_nans(data):
    """Split a pandas data frame at rows that contain all null values

    Argument:

        data - a pandas data frame

    Returns:

        a list of pandas data frames

    """
    result = []
    null_lines = data[data.isnull().all(axis=1) == True].index
    start = 0
    for line_index in sorted(null_lines):
        data_subset = data.loc[start:line_index]
        result.append(data_subset)
        start = line_index+1
    return result

def clean_tv_table(dirty_tv_table):
    """The Tumour Volume tables generated by splitting on NaN lines
    have flanking NaN columns and rows
    Returns the table name/title and a pandas dataframe with samples
    as column ids and days as row ids

    Argument:

        dirty_tv_table - a pandas data frame with a title row
                         followed by a header row and rows of
                         data lines, surrounded by arbitrary 
                         NaN null cell entries

    Returns:

        name     - the value of the title row
        tv_table - a pandas data frame with named row columns
                   and row item identifiers
    """
    tv_table = dirty_tv_table.copy()
    tv_table = tv_table.dropna(axis=1,how='all')
    tv_table = tv_table.dropna(axis=0,how='all')
    name = tv_table[1].iloc[0]
    tv_table.columns = tv_table.iloc[1]
    tv_table.index = tv_table['Day']
    tv_table = tv_table.iloc[2:,1:]
    tv_table = tv_table.drop('Mean',axis=1)
    return name,tv_table

def studylog_prism_df_to_tv_tables(df):
    """abstracted from studylog_prism_to_tv_tables to allow sane testing
    Use studylog_prism_to_tv_tables"""
    start_of_tv = df.loc[df[0] == 'Tumor Volume (All Animals)'].index[1]
    end_of_tv = df.loc[df[0] == 'Scatterplot information for Prism'].index[0]
    tv_tables = []
    for x in split_on_nans(df[start_of_tv+1:end_of_tv]):
        if len(x.index) > 3:
            cleaned = clean_tv_table(x)
            if not (cleaned[1].empty):
                tv_tables.append(cleaned)
    return dict(tv_tables)

def studylog_prism_to_tv_tables(xlsx_filename, sheetname='PrismRaw'): #pragma no cover
    """A function for converting study log Absolute TV format Excel files
    to dataframes.
    
    Arguments:

        xlsx_filename - a Studylog Excel Prism output file
        sheetname     - the name of the sheet to extract from
                        Default: 'PrismRaw'

    Returns:

        a python dictionary of {name:dataframe} where name is the
        title of the experimental group and dataframe is a pandas
        data frame with columns for each individual and rows for
        volume measurements at a given time point
    """
    df = pandas.read_excel(xlsx_filename, sheetname=sheetname, header=None)
    return studylog_prism_df_to_tv_tables(df)

def clean_studylog_absolute_tv(absolute_tv_df):
    """Cleans and reformat a dataframe of volume measurements
    that has been extracted from a Studylog Absolute TV excel
    spreadsheet.
    Returns the table name/title and a pandas dataframe with samples
    as column ids and days as row ids

    Argument:

        dirty_tv_table - a pandas data frame where the first
                         three columns are Group, Animal ID	and
                         Study Days Data Type followed by left
                         aligned measurement columns named by day
                         padded by NaN null cell entries.
                         All entries must be from the same group

    Returns:

        tv_table - a pandas data frame with named row columns
                   and row item identifiers
    """
    absolute_tv_df.dropna(axis=1, how='all', inplace=True)
    absolute_tv_df.dropna(axis=0, how='all', inplace=True)
    absolute_tv_df.index = absolute_tv_df['Animal ID']
    return absolute_tv_df.T[3:]

def studylog_absolute_to_tv_tables(xlsx_filename,
                                   sheetname='Absolute TV',
                                   header_length=5): #pragma no cover
    """A function for converting study log Prism format Excel files
    to dataframes.
    
    Arguments:

        xlsx_filename - a Studylog Excel Absolute TV output file
        sheetname     - the name of the sheet to extract from
                        Default: 'Absolute TV'

    Returns:

        a python dictionary of {name:dataframe} where name is the
        title of the experimental group and dataframe is a pandas
        data frame with columns for each individual and rows for
        volume measurements at a given time point
        Note: Raw days are returned - use standardise_days to fix
    """
    absolute_df = pandas.read_excel(xlsx_filename,
                                    sheetname=sheetname,
                                    header=header_length)
    return studylog_absolute_df_to_tv_tables(absolute_df)

def studylog_absolute_df_to_tv_tables(absolute_df):
    """abstracted from studylog_absolute_to_tv_tables to allow sane testing
    Use studylog_absolute_to_tv_tables"""
    absolute_df.sort_values(by=['Group'], inplace=True)
    absolute_df.set_index(keys=['Group'], drop=False, inplace=True)
    groups = absolute_df['Group'].unique().tolist()
    tv_tables = {elem : pandas.DataFrame for elem in groups}
    for key in tv_tables.keys():
        tv_tables[key] = clean_studylog_absolute_tv(absolute_df[:][absolute_df.Group == key])
    return tv_tables

def fixed_length_alternate_steps(start,length,step1,step2):
    """Generate list of numbers that increments buy
    steps of alternating magnitude eg [1,4,8,11,15]
    
    Arguments:

        start    -  value of first entry in list
        length   -  length of list to be generated
        step1    -  the magnitude of odd numbered steps
        step2    -  the magnitude of even numbered steps

    Returns:

        a list of numeric values
    """
    result = []
    x=start
    result.append(x)
    second_step = False
    while len(result) < length:
        if second_step:
            x += step2
        else:
            x += step1
        result.append(x)
        second_step = not second_step
    return result

def standardise_days(dataframe,first_interval=3,second_interval=4):
    """Renumber days in study log files by changing
    day numbers to series incrementing by alternating periods.
    (eg 3 day and 4 day periods) to adjust for individuals going
    on study on different days of the week.
    
    Arguments:

        dataframe          -  a pandas data frame with a
                              day based row index
        first_interval     -  the magnitude of odd numbered steps
        second_interval    -  the magnitude of even numbered steps
    
    Returns:

        a pandas dataframe with standardised days as index.
    """
    dataframe.index = fixed_length_alternate_steps(1,len(dataframe.index),first_interval,second_interval)
    return dataframe

Functions

def clean_studylog_absolute_tv(

absolute_tv_df)

Cleans and reformat a dataframe of volume measurements that has been extracted from a Studylog Absolute TV excel spreadsheet. Returns the table name/title and a pandas dataframe with samples as column ids and days as row ids

Argument:

dirty_tv_table - a pandas data frame where the first
                 three columns are Group, Animal ID     and
                 Study Days Data Type followed by left
                 aligned measurement columns named by day
                 padded by NaN null cell entries.
                 All entries must be from the same group

Returns:

tv_table - a pandas data frame with named row columns
           and row item identifiers
def clean_studylog_absolute_tv(absolute_tv_df):
    """Cleans and reformat a dataframe of volume measurements
    that has been extracted from a Studylog Absolute TV excel
    spreadsheet.
    Returns the table name/title and a pandas dataframe with samples
    as column ids and days as row ids

    Argument:

        dirty_tv_table - a pandas data frame where the first
                         three columns are Group, Animal ID	and
                         Study Days Data Type followed by left
                         aligned measurement columns named by day
                         padded by NaN null cell entries.
                         All entries must be from the same group

    Returns:

        tv_table - a pandas data frame with named row columns
                   and row item identifiers
    """
    absolute_tv_df.dropna(axis=1, how='all', inplace=True)
    absolute_tv_df.dropna(axis=0, how='all', inplace=True)
    absolute_tv_df.index = absolute_tv_df['Animal ID']
    return absolute_tv_df.T[3:]

def clean_tv_table(

dirty_tv_table)

The Tumour Volume tables generated by splitting on NaN lines have flanking NaN columns and rows Returns the table name/title and a pandas dataframe with samples as column ids and days as row ids

Argument:

dirty_tv_table - a pandas data frame with a title row
                 followed by a header row and rows of
                 data lines, surrounded by arbitrary 
                 NaN null cell entries

Returns:

name     - the value of the title row
tv_table - a pandas data frame with named row columns
           and row item identifiers
def clean_tv_table(dirty_tv_table):
    """The Tumour Volume tables generated by splitting on NaN lines
    have flanking NaN columns and rows
    Returns the table name/title and a pandas dataframe with samples
    as column ids and days as row ids

    Argument:

        dirty_tv_table - a pandas data frame with a title row
                         followed by a header row and rows of
                         data lines, surrounded by arbitrary 
                         NaN null cell entries

    Returns:

        name     - the value of the title row
        tv_table - a pandas data frame with named row columns
                   and row item identifiers
    """
    tv_table = dirty_tv_table.copy()
    tv_table = tv_table.dropna(axis=1,how='all')
    tv_table = tv_table.dropna(axis=0,how='all')
    name = tv_table[1].iloc[0]
    tv_table.columns = tv_table.iloc[1]
    tv_table.index = tv_table['Day']
    tv_table = tv_table.iloc[2:,1:]
    tv_table = tv_table.drop('Mean',axis=1)
    return name,tv_table

def fixed_length_alternate_steps(

start, length, step1, step2)

Generate list of numbers that increments buy steps of alternating magnitude eg [1,4,8,11,15]

Arguments:

start    -  value of first entry in list
length   -  length of list to be generated
step1    -  the magnitude of odd numbered steps
step2    -  the magnitude of even numbered steps

Returns:

a list of numeric values
def fixed_length_alternate_steps(start,length,step1,step2):
    """Generate list of numbers that increments buy
    steps of alternating magnitude eg [1,4,8,11,15]
    
    Arguments:

        start    -  value of first entry in list
        length   -  length of list to be generated
        step1    -  the magnitude of odd numbered steps
        step2    -  the magnitude of even numbered steps

    Returns:

        a list of numeric values
    """
    result = []
    x=start
    result.append(x)
    second_step = False
    while len(result) < length:
        if second_step:
            x += step2
        else:
            x += step1
        result.append(x)
        second_step = not second_step
    return result

def split_on_nans(

data)

Split a pandas data frame at rows that contain all null values

Argument:

data - a pandas data frame

Returns:

a list of pandas data frames
def split_on_nans(data):
    """Split a pandas data frame at rows that contain all null values

    Argument:

        data - a pandas data frame

    Returns:

        a list of pandas data frames

    """
    result = []
    null_lines = data[data.isnull().all(axis=1) == True].index
    start = 0
    for line_index in sorted(null_lines):
        data_subset = data.loc[start:line_index]
        result.append(data_subset)
        start = line_index+1
    return result

def standardise_days(

dataframe, first_interval=3, second_interval=4)

Renumber days in study log files by changing day numbers to series incrementing by alternating periods. (eg 3 day and 4 day periods) to adjust for individuals going on study on different days of the week.

Arguments:

dataframe          -  a pandas data frame with a
                      day based row index
first_interval     -  the magnitude of odd numbered steps
second_interval    -  the magnitude of even numbered steps

Returns:

a pandas dataframe with standardised days as index.
def standardise_days(dataframe,first_interval=3,second_interval=4):
    """Renumber days in study log files by changing
    day numbers to series incrementing by alternating periods.
    (eg 3 day and 4 day periods) to adjust for individuals going
    on study on different days of the week.
    
    Arguments:

        dataframe          -  a pandas data frame with a
                              day based row index
        first_interval     -  the magnitude of odd numbered steps
        second_interval    -  the magnitude of even numbered steps
    
    Returns:

        a pandas dataframe with standardised days as index.
    """
    dataframe.index = fixed_length_alternate_steps(1,len(dataframe.index),first_interval,second_interval)
    return dataframe

def studylog_absolute_df_to_tv_tables(

absolute_df)

abstracted from studylog_absolute_to_tv_tables to allow sane testing Use studylog_absolute_to_tv_tables

def studylog_absolute_df_to_tv_tables(absolute_df):
    """abstracted from studylog_absolute_to_tv_tables to allow sane testing
    Use studylog_absolute_to_tv_tables"""
    absolute_df.sort_values(by=['Group'], inplace=True)
    absolute_df.set_index(keys=['Group'], drop=False, inplace=True)
    groups = absolute_df['Group'].unique().tolist()
    tv_tables = {elem : pandas.DataFrame for elem in groups}
    for key in tv_tables.keys():
        tv_tables[key] = clean_studylog_absolute_tv(absolute_df[:][absolute_df.Group == key])
    return tv_tables

def studylog_absolute_to_tv_tables(

xlsx_filename, sheetname='Absolute TV', header_length=5)

A function for converting study log Prism format Excel files to dataframes.

Arguments:

xlsx_filename - a Studylog Excel Absolute TV output file
sheetname     - the name of the sheet to extract from
                Default: 'Absolute TV'

Returns:

a python dictionary of {name:dataframe} where name is the
title of the experimental group and dataframe is a pandas
data frame with columns for each individual and rows for
volume measurements at a given time point
Note: Raw days are returned - use standardise_days to fix
def studylog_absolute_to_tv_tables(xlsx_filename,
                                   sheetname='Absolute TV',
                                   header_length=5): #pragma no cover
    """A function for converting study log Prism format Excel files
    to dataframes.
    
    Arguments:

        xlsx_filename - a Studylog Excel Absolute TV output file
        sheetname     - the name of the sheet to extract from
                        Default: 'Absolute TV'

    Returns:

        a python dictionary of {name:dataframe} where name is the
        title of the experimental group and dataframe is a pandas
        data frame with columns for each individual and rows for
        volume measurements at a given time point
        Note: Raw days are returned - use standardise_days to fix
    """
    absolute_df = pandas.read_excel(xlsx_filename,
                                    sheetname=sheetname,
                                    header=header_length)
    return studylog_absolute_df_to_tv_tables(absolute_df)

def studylog_prism_df_to_tv_tables(

df)

abstracted from studylog_prism_to_tv_tables to allow sane testing Use studylog_prism_to_tv_tables

def studylog_prism_df_to_tv_tables(df):
    """abstracted from studylog_prism_to_tv_tables to allow sane testing
    Use studylog_prism_to_tv_tables"""
    start_of_tv = df.loc[df[0] == 'Tumor Volume (All Animals)'].index[1]
    end_of_tv = df.loc[df[0] == 'Scatterplot information for Prism'].index[0]
    tv_tables = []
    for x in split_on_nans(df[start_of_tv+1:end_of_tv]):
        if len(x.index) > 3:
            cleaned = clean_tv_table(x)
            if not (cleaned[1].empty):
                tv_tables.append(cleaned)
    return dict(tv_tables)

def studylog_prism_to_tv_tables(

xlsx_filename, sheetname='PrismRaw')

A function for converting study log Absolute TV format Excel files to dataframes.

Arguments:

xlsx_filename - a Studylog Excel Prism output file
sheetname     - the name of the sheet to extract from
                Default: 'PrismRaw'

Returns:

a python dictionary of {name:dataframe} where name is the
title of the experimental group and dataframe is a pandas
data frame with columns for each individual and rows for
volume measurements at a given time point
def studylog_prism_to_tv_tables(xlsx_filename, sheetname='PrismRaw'): #pragma no cover
    """A function for converting study log Absolute TV format Excel files
    to dataframes.
    
    Arguments:

        xlsx_filename - a Studylog Excel Prism output file
        sheetname     - the name of the sheet to extract from
                        Default: 'PrismRaw'

    Returns:

        a python dictionary of {name:dataframe} where name is the
        title of the experimental group and dataframe is a pandas
        data frame with columns for each individual and rows for
        volume measurements at a given time point
    """
    df = pandas.read_excel(xlsx_filename, sheetname=sheetname, header=None)
    return studylog_prism_df_to_tv_tables(df)