QMUL_Final_Project/Project_Writeup.tex

\documentclass[titlepage, 12pt]{scrartcl} \usepackage{enumitem}

\usepackage[british]{babel}
\usepackage[style=apa, backend=biber]{biblatex}
\DeclareLanguageMapping{british}{british-apa}
\usepackage{url}
\usepackage{float}
\usepackage{ragged2e}
\usepackage{caption}
\usepackage{multicol}
\newcommand{\tabitem}{~~\llap{\textbullet}~~}
%\restylefloat{table}
\usepackage[table]{xcolor}
\usepackage{multirow}
\usepackage{perpage}
\MakePerPage{footnote}
\usepackage{abstract}
\usepackage{graphicx}
\usepackage{setspace}
% Create hyperlinks in bibliography
\usepackage{hyperref}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{tabulary}

\usepackage[pass]{geometry}
\usepackage{pdflscape}
\usepackage{graphicx}

\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{blindtext}
\setkomafont{disposition}{\normalfont\bfseries}

\usepackage{etoolbox}

\usepackage{titlesec}
\setcounter{secnumdepth}{4}

\titleformat{\paragraph}
{\normalfont\normalsize\bfseries}{\theparagraph}{1em}{}
\titlespacing*{\paragraph}
{0pt}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}

\graphicspath{{./resources/}}
\addbibresource{~/Documents/library.bib}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% Fix for medeley's rubbish underscore handeling in generated bib files
\DeclareSourcemap{
    \maps{
        \map{ % Replaces '{\_}', '{_}' or '\_' with just '_'
            \step[fieldsource=url,
                  match=\regexp{\{\\\_\}|\{\_\}|\\\_},
                  replace=\regexp{\_}]
        }
        \map{ % Replaces '{'$\sim$'}', '$\sim$' or '{~}' with just '~'
            \step[fieldsource=url,
                  match=\regexp{\{\$\\sim\$\}|\{\~\}|\$\\sim\$},
                  replace=\regexp{\~}]
        }
        \map{ % Replaces '{\_}', '{_}' or '\_' with just '_'
            \step[fieldsource=url,
                  match=\regexp{\{\\\#\}|\{\#\}|\\\#},
                  replace=\regexp{\#}]
        }
    }
}

%\newsavebox{\abstractbox}
%\renewenvironment{abstract}
%  {\begin{lrbox}{0}\begin{minipage}{\textwidth}
%   \begin{center}\normalfont\sectfont\abstractname\end{center}\quotation}
%  {\endquotation\end{minipage}\end{lrbox}%
%   \global\setbox\abstractbox=\box0 }

%\makeatletter
%\expandafter\patchcmd\csname\string\maketitle\endcsname
%  {\vskip\z@\@plus3fill}
%  {\vskip\z@\@plus2fill\box\abstractbox\vskip\z@\@plus1fill}
%  {}{}
%\makeatother

\newcommand{\dtoprule}{\specialrule{1pt}{0pt}{1.4pt}%
            \specialrule{1pt}{0pt}{\belowrulesep}%
            }
\newcommand{\dbottomrule}{\specialrule{1pt}{0pt}{1.4pt}%
            \specialrule{1pt}{0pt}{\belowrulesep}%
            }

\DeclareCiteCommand{\citeyearpar}
    {}
    {\mkbibparens{\bibhyperref{\printdate}}}
    {\multicitedelim}
    {}

% MATLAB Code block stuff...
\usepackage{color}
\usepackage{listings}

\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}

\lstset{language=Matlab,
   keywords={break,case,catch,continue,else,elseif,end,for,function,
      global,if,otherwise,persistent,return,switch,try,while},
   basicstyle=\ttfamily,
   keywordstyle=\color{blue},
   commentstyle=\color{gray},
   stringstyle=\color{dkgreen},
   numbers=left,
   numberstyle=\tiny\color{gray},
   stepnumber=1,
   numbersep=10pt,
   backgroundcolor=\color{white},
   tabsize=4,
   showspaces=false,
   showstringspaces=false}
\usepackage[shortcuts]{extdash}

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
   keywords={},
    numberstyle=\tiny,
    basicstyle=\scriptsize,
    breakatwhitespace=false,
    breaklines=false,
    captionpos=b,
    keepspaces=true,
    numbers=left,
    numbersep=5pt,
    showspaces=false,
    showstringspaces=false,
    showtabs=false,
    tabsize=2
}
\lstset{style=mystyle}

\begin{document}
\newgeometry{lmargin=1.5cm}
\begin{titlepage}

    \begingroup

    \setlength{\tabcolsep}{1.5cm}

    \begin{tabular}[c]{p{0.30\textwidth} | p{0.4\textwidth}}

    {\vspace{1.2cm} \Large School of Electronic Engineering and Computer Science \par}
    &
    {\vspace{1.2cm} \large Sound and Music Computing \newline Project Report \the\year \par}\\

    & {\vspace{0.5cm} \Large \textbf{Extraction of Audio Features from PCG Signals for the
Classification of Heart Abnormalities} \par}\\

    \vspace{0.4\textheight}
    \includegraphics[width=5cm]{qmul_logo}
    &
    {\vspace{1cm} \large \textbf{Samuel Perry}}\\

    &
    \multicolumn{1}{|r}{August \the\year}

    \end{tabular}

    \endgroup

\end{titlepage}
\restoregeometry

\doublespacing
\begin{abstract}
    A system is presented for the analysis and classification of pathological
    phonocardiogram signals. The project aims to provide a reliable and robust
    detection method to aid in the early detection of cardiovascular
    abnormalities. The project utilises a novel combination of feature
    extraction, optimisation and machine learning techniques to produce
    accurate results, demonstrating the clear potential for this methodology in
    the area of heart abnormality detection.
\end{abstract}

\renewcommand{\abstractname}{Acknowledgements}
\begin{abstract}
I would like to thank my supervisor, Dr.\ Tony Stockman, for his guidance and
support throughout the project. I am also extremely grateful for the advice
given by Robert Blaauboer. Finally, I would also like to thank John Thompson,
Tom Pengelly and David Pengelly for their valuable support.
\end{abstract}

\tableofcontents
\newpage

\section{Introduction}
Cardiovascular diseases are the most prevalent cause of death in Europe,
accounting for 37.5\% of all fatalities in 2013~\parencite{Eurostat2016}.
Traditionally, cardiac auscultation has been performed manually using a standard
stethoscope, with the aim of detecting heart defects aurally. This has been a
fundamental method for detecting heart valve disorders for over a century.
However, auscultation is a skill that requires training and can only usually be
performed by a medical professional, such as a GP. As a result, manual
auscultation is significantly susceptible to human error~\parencite{Hanna2002}.
Automation of this method using technology may provide a solution, and
recent research has shown promise in this area. A large amount of research has
focused on analysis of Electrocardiogram (ECG) signals.  Although useful for
detecting pathologies, ECG equipment is expensive and requires a trained
professional for use. Therefore it is not currently feasible for developing
countries and rural areas where there may be few physicians available. A
comparatively affordable and non-invasive alternative is the Phonocardiogram
(PCG)~\parencite[p.130]{Reed2004}. Typically recorded using an electronic
stethoscope, a PCG signal is a recording of sound made as the heart contracts,
analogous to the sound heard by physicians when performing cardiac auscultation
manually. Automated auscultation could provide an initial diagnosis for heart
defects without the need for a trained medical professional. This would allow
relatively cheap equipment to analyse a patient's heart sound, and
automatically recommend further inspection based on analysis.  By providing
earlier diagnosis of conditions that may have otherwise been overlooked, this
technology could have a significant impact on reducing mortality rates as a
result of heart conditions.

\section{Related work}
There are currently a wide variety of methods employed for the analysis and
classification of PCG signals. Current methods can typically be divided into 3
areas, each of which are combined to create a full classification system. These
areas are: signal preprocessing, signal segmentation, and feature
extraction/classification. The performance and evaluation of complete systems
are also discussed in section~\ref{Classification}


\subsection{Signal preprocessing}
There are a large number of factors that lead to variation in quality of PCG
recordings: stethoscope type, make and model, its microphone/sensors, the
position used to record (i.e.\ lower left sternal border, apex, pulmonic area,
aortic area), built in filters/signal processing used by the stethoscope (i.e.\
noise filters, anti-tremor filters), medication that a patient may be taking,
as well as many other aspects that may influence the recorded
signal~\parencite[p.4]{Pavlopoulos2004}. This presents a significant issue when
attempting to analyse and compare a database of signals, as variations in
recordings and artefacts caused by factors other than heart sounds will most
likely interfere with analysis and comparison methods. To account for this,
pre-processing methods are widely used, aiming to standardise a database. This
is also used as a way to accentuate features of the data that are expected to
be relevant for classification.\\

A common method employed is the use of decimation and static filters to remove
unwanted spectral content that is most likely noise~\parencite{Liang1997a,
Homsi2016, Springer2016, Gupta2007}. This helps reduce higher frequency noise
such as speech, microphone movement, breathing and other interference caused
externally. Signals are commonly downsampled to around 1--4KHz, with
anti-aliasing filter specifications varying across the literature. Generally,
highpass chebychev or butterworth filters are favoured with cutoff frequencies
ranging from 400--750Hz.\\

In addition, many methods decompose the filtered signal using wavelet based
methods, such as the discrete wavelet transform (DWT)~\parencite{Liang1997a,
Pavlopoulos2004}, continuous wavelet transform (CWT)~\parencite{Langley2016} or
wavelet package decomposition (WPD)~\parencite{Liang1998}. These methods are
commonly used to separate components of a signal based on their spectral
content. Wavelet transforms are popular as, unlike Fourier transforms, they
are well localised in both the time and frequency domain. This allows for the
analysis of PCG signals across multiple frequency bands whilst maintaining
transient temporal events in the resulting
decomposition~\parencite[p.93]{Ari2008}. This may be used for analysis of
transient events such as murmurs, that may consist of higher frequency
components than normal heart sounds.

\subsection{Signal segmentation}\label{Segmentation}
Algorithms for the segmentation of PCG data aim to extract the structure of the
signal over time. This is a key stage in the analysis of PCG signals, as the
structure of the signal and relationships between the fundamental heart sounds
(FHSs) form the basis for much of further analysis performed on PCG data.\\

% TODO: insert segmented graph of PCG cycle

A number of methods exist for the extraction of FHSs. Traditional methods rely
on direct extraction of peaks from amplitude envelopes in the time domain to
determine the structure of a signal.  These methods perform various
processing/transformations in order to accentuate the transient events with the
intention of isolating them.\\
Early work in this area by Liang et.\ al described a method using the popular
Shannon energy envelope, achieving good accuracy across 37 recordings of
children~\parencite{Liang1997b}. The algorithm aimed to segment the data by
first extracting the envelope, then applying adaptive rule based thresholds, to
determine peaks corresponding to segmentation points. When comparing results to
hand-annotated ground truth data, the system achieved a reported accuracy score
of 84\%. However, due to the small sample size, and potential lack of noise in
the database used, this may not translate well to a larger database recorded in
sub-optimal conditions.\\
More recent methods used spectral representations to assist in the splitting of
the FHSs, in particular using wavelet decomposition. These methods tend to
perform more robustly on signals of varying conditions.\\
Building on previous work, Liang et.\ al presented an improved method, using the
discrete wavelet transform to decompose and reconstruct the signal into 7
distinct frequency bands~\parencite{Liang1997a}. Applying a similar method
of envelope extraction and peak picking to each frequency band, the best
estimate of all frequency bands is then chosen as the final result. Criterion
for this choice is based on the number of S1s and S2s detected, and the number
of artefacts discarded for each frequency band. This method achieved an
improved accuracy of 93\% across a larger database of 77 recordings. This
suggests that the algorithm is as robust if not more so than previous work by
Liang et al.\\

Vepa et al.\ proposed a wavelet decomposition based method that uses a
combination of simplicity and envelope features~\parencite{Vepa2008}. This
approach attempts to improve robustness when analysing signals of varying
quality by using multiple complimentary features. This allows the method to base
decisions on a variety of statistical properties. Evaluating the algorithm on a
collection of 160 heart cycles from a variety of sources, a reported accuracy
of 84\% was achieved.\\

More recently, a variety of machine learning methods have been implemented with
reasonable success. Gupta et al.\ presented a method that applies $k$-means
clustering to replace standard threshold-based methods for determining peak
classification in a segmentation algorithm based on standard
envelopes~\parencite{Gupta2007}. This achieved a reported accuracy of 90.29\%.
Due to the envelope-based method for feature extraction, this method is still
susceptible to noise and artefacts that occur within the frequency bands of the
heart sounds.\\

Sepehri et al.\ proposed a method that combines neural networks with Power
Spectral Density (PSD) estimates~\parencite{Sepehri2010}.  By exploiting the
periodic nature of S1 and S2 heart sounds, combined with their narrow frequency
range, a neural network is trained to separate these sounds from other events,
such as noise and murmurs. This method achieved a reported 93.6\% accuracy on a
significantly larger database than previous methods detailed.\\

The most significant successes in segmentation algorithms have been observed through use
of probabilistic models such as Hidden Markov Models (HMMs). Early research
using these models by Ricke et al.\ utilised embedded HMMs to model the 4
states of the PCG and their transitions~\parencite{Ricke2005}. MFCCs and
Shannon Energy were used as feature vectors for the models. Results of
98\% accuracy were reported, although this was tested on only a small database
of signals.\\
Gill et al.\ achieved similar results, most notably with specific consideration
for the duration of each state in the HMM~\parencite{Gill2005}. This is
handled through the extraction of 6 duration features based primarily on peaks.
These features form vectors for training the HMM. Results of 98.6\%
sensitivity, 96.9\% positive predictivity for S1 sounds and 98.3\% sensitivity,
96.5\% positive predictivity for S2 sounds is reported.
The issue of state duration was further addressed by Schmidt et.\ al through use
of a Duration-dependent Hidden Markov Model (DHMM)~\parencite{Schmidt2015}. The
DHMM is a modified HMM that considers the duration of the current state when
calculating the probability of transition to another state. This modification
scored a reported sensitivity of 98.8\% and a positive predictivity of
98.6\%.\\
Building on previous work using HMMs, Springer et al.\ presented a segmentation
algorithm by using Hidden Semi-Markov Models (HSMMs) in combination with
logistic regression~\parencite{Springer2016}. Use of Hidden semi markov model
allows for a priori information on the duration of the current state to be used
in probability calculation of the subsequent state. In this case, the knowledge
that there is an upper and lower limit on the duration of each component is
used in the calculation of transition probabilities.  A modified viterbi algorithm
is then used to calculate the most likely set of transitions based on observed
features. Logistic regression is used to improve discrimination between state
features when compared to discriminatory methods used by previous work.
Performance was evaluated on a significantly larger database than previous
methods and achieved a reported accuracy of $95.63\% \pm 0.85\%$. Due to it's
rigorous evaluation and high accuracy, this method is currently considered the
state-of-the-art for PCG signal segmentation.\\

Table~\ref{SegmentationTable} provides a brief overview of significant research
into PCG segmentation. For a more complete summary of the current state of PCG
segmentation, please refer to Liu et.\ al~\parencite{Liu2016}

\newgeometry{margin=1cm} % modify this if you need even more space
\begin{landscape}
%\centering
\begin{table}[htbp]
\captionof{table}{Summary of Segmentation Algorithms} \label{SegmentationTable}
\scriptsize
\rowcolors{1}{gray!15}{white}
\doublespacing
\begin{tabulary}{\linewidth}{LLLLL}
\dtoprule
Author                 & Method                                                                                         & databases                                                                                       & \mbox{Reported} Results         & Notes                                                                                            \\ \bottomrule
Springer et.\ al \citeyearpar{Springer2016} & HSMM, Logistic regression                                                                       & 10,172s of recordings from 112 patients. 12,181 first and 11,627 second heart sounds. & $95.63\pm0.85\%$                             & Supervised algorithm.                                                                                                                                                            \\
Huiying et.\ al \citeyearpar{Liang1997b} & Normalised average Shannon energy envelope, peak picking                                        & 37 recordings, 14 pathological murmurs and 23 physiological murmurs. 515 cycles       & $91.03\%\;Ac$                                          & Unsupervised Algorithm.  database consists entirely of child recording. Optimized on full database                                                                                 \\
Vepa et.\ al \citeyearpar{Vepa2008}     & Wavelet decomposition, energy and simplicity measurement                                       & 160 heart cycles collected from a variety of sources (training CDs, web resources)    & $84\%\;Ac$                                             & Unsupervised Algorithm, Optimized on full database                                                                                                                                \\
Sun et.\ al \citeyearpar{Sun2014}             & Viola integral envelope extraction, short-time modified Hilbert transform, peak picking        & 6949s of recordings, from 121 patients                                                & $97.37\%\;Ac$                                          & Supervised algorithm. Tolerance for segmentation accuracy not specified                                                                                                          \\
Sepehri et.\ al \citeyearpar{Sepehri2010}        & Spectral density estimation, auto-regressive parameters, multi-layer perceptron neural network & 120 recording, from 60 patients                                                       & $93.6\%\;Ac$                                           & Supervised algorithm                                                                                                                                                             \\
Ricke et.\ al \citeyearpar{Ricke2005}    & Shannon energy (and related features), HMM                                                     & 9 recordings, from 9 patients                                                         & $98\%\;Ac$                                             & Supervised algorithm                                                                                                                                                             \\
Schmidt et.\ al \citeyearpar{Schmidt2015}  & DHMM, Auto-correlation duration features, Homomorphic envelogram                               & 113 recordings, from 113 patients. 8s per recording. 15 abnormal recordings           & $98.8\;Se,\;98.6\;P_+$ on test set                         & All data recorded ``lateral to the sternum in the fourth intercostal space on the left side''. Mix of noisy and clean recordings. 40 recording used for training, 73 for testing \\
Gill et.\ al \citeyearpar{Gill2005}         & Homomorphic envelogram, Embedded HMMs                                                          & 44 recording, 17 subjects. 30-60s per recording                                       & $98.6\%\;Ac, 96.9\;P_+$ for S1. $98.3\;Ac,\;96.5\;P_+$ for S2 & Recording taken in sub-optimal environments (noisy hospitals, offices etc...)                                                                                                    \\
Gupta et.\ al \citeyearpar{Gupta2007}    & Homomorphic filtering, $k$-means clustering                                                       & 41 patients, 340 heart cycles. 110 normal,  124 systolic murmur, 106 diastolic murmur & $90.29\%\;Ac$                                          & Unsupervised Algorithm.                                                                                                                                                          \\ \hline
\dbottomrule\\
\end{tabulary}
$Ac = \text{Accuracy}, Se = \text{Sensitivity}, P_+ = \text{Positive predictivity}$
\end{table}
\end{landscape}
\restoregeometry


\doublespacing

\subsection{Feature extraction/classification models}\label{Classification}

A wide variety of methods exist for the extraction of statistical features and
classification of PCG data. Most notably, the range of methods that were
developed as a result of the recent Physionet/Computing in Cardiology (CinC)
Challenge 2016 have improved the quality of abnormality classification in
noisy signals. The challenge was assembled to provide researchers with a large
database of normal/pathological PCG signals of varying quality. This enabled
the development of algorithms that could be evaluated on a significant
database, in order to determine performance across a range of conditions/signal
qualities~\parencite{Clifford2016}. This section first details significant work
produced prior to the challenge, then highlights key works produced for the
challenge to outline the breadth of methods for robust heart sound analysis.

\subsubsection{Work prior to the Physionet challenge}
Work prior to the Physionet challenge was conducted predominantly with the aim
of classifying specific heart conditions. Until recently, little research had
been produced with regards to general abnormality detection, with many projects
choosing to focus on specific conditions such as murmurs, atrial fibrillation,
flutter, and heart valve disease. This section outlines some key research
into these areas, alongside initial research into general abnormality
detection.\\

Reed et al.\ implemented a simple general classification algorithm using
artificial neural networks (ANNs) and wavelet
decomposition~\parencite{Reed2004}. As initial work into this field,
preprocessing such as segmentation is not performed and features remain
relatively simple when compared to more recent methods. Also, due to the
comparatively small sample size used for training (1 patient per abnormality, 4
cycles per patient), a reported accuracy of 100\% would have a strong
possibility of generalising poorly. This does however, serve as an early
example of limited success in general heart sound classification.\\

Maglogiannis et al.\ presented a classifier for discrimination of heart valve
disease from regular heart sounds using an SVM (Support Vector Machine)
classifier~\parencite{Maglogiannis2009}.  Roughly 100 features were extracted
from the signal, based on direct analysis of each heart cycle component (S1,
Systole, S2, Diastole) and the average Shannon energy envelope of these
components.  A database of 198 heart sounds was curated for the project,
acquired from 8 sources, such as medical CDs and pre-existing databases.  An
accuracy of 91.43\% was reported using 10-fold stratified cross-validation.  In
addition, the project aimed to classify individual abnormalities in a 3 step
process, by distinguishing between systolic or diastolic murmurs, and then
distinguishing between aortic or mitral diseases. The classifier achieved
accuracy between 90-97\% for these classifications. This approach demonstrates
the potential for a system to accurately distinguish between normal and
abnormal heart sounds in a generalisable way, given carefully selected
features.\\

Ari et.\ al also proposed an SVM based method for abnormality
classification~\parencite{Ari2010}. A modified Least-squares SVM (LSSVM) is
used in order to improve separability between normal and abnormal datapoints
during training. 32 wavelet based features from previous literature are used as
feature vectors for a modified LSSVM, un-modified LSSVM and a standard SVM.
Comparison of the system shows that the proposed technique performs
significantly better on all test sets with an accuracy of between 86\% and
100\%, dependent on database. This highlights the importance of choosing an
appropriate classification method for achieving accurate results.\\

Quiceno-Manrique et al.\ demonstrate the use of various time frequency
representations (TFR) such as short-time Fourier transform, wavelet transforms,
Wigner-Ville distribution etc\ldots, with a $k$-nearest neighbour classifier
(k-NN) for systolic murmur detection~\parencite{Quiceno-Manrique2010a}. This
work highlights the effectiveness of alternative TFRs to traditional Fourier
methods. This method also employs Principle Component Analysis (PCA), ie.\ the
mapping of a high dimensional feature space to a lower dimension, in order to
improve computational performance. Features were evaluated using a database
of of 22 patients, 6 of which were labeled as having a systolic murmur. The
highest reported accuracy was achieved using MFCCs as the primary feature
vector achieving a 98\% accuracy on 10-fold cross validation.\\

Schmidt et al.\ aimed to find features that could be used for classification of
coronary artery disease through detection of small
murmurs~\parencite{Schmidt2015}. A large number of features are
calculated to provide vectors for classification; Parametric spectral features
such as ARMA are used, alongside instantaneous frequency and octave power
measurements. Complexity features, such as sample entropy and simplicity, are
also calculated in an attempt to exploit the likely stochastic nature of
murmurs when compared to normal heart sounds.  Given the large number of
features calculated, PCA is used to retain only the most relevant information.
Quadratic discriminant analysis (QDA) is then used as a classifier to provide a
final accuracy score of 73\%.\\

An overview of significant research prior to the Physionet challenge is
provided in table~\ref{SumPrior}. It is also noted that none of the databases
used for prior research are publicly available.


\newgeometry{margin=1cm} % modify this if you need even more space
\begin{landscape}
\begin{table}[htbp]
    \captionof{table}{Summary of research prior to the Physionet Challenge 2016}\label{PriorWorkTable}
\scriptsize
%\centering
\rowcolors{1}{gray!15}{white}
\label{SumPrior}
\doublespacing
\begin{tabulary}{\linewidth}{LLLLLL}
\dtoprule
Author                   & Pre-processing/segmentation                                                                                                               & Features                                                                                                        & Classification Method & Database                                                                                                                 & Reported Accuracy                                  \\ \hline
Maglogiannis et.~al \citeyearpar{Maglogiannis2009}     & Wavelet decomposition, Shannon energy peak picking                                                                                        & Features derived from wavelet decomposition and PCG segmentations                                               & SVM                   & 198 recordings, 38 normal, 41 AS systolic murmur, 43 MR systolic murmur, 38 AR diastolic murmur, 38 MS diastolic murmur & $91.43\%\;Ac$                                      \\
Ari et~al. \citeyearpar{Ari2010}              & Amplitude envelope peak picking~\parencite{Ari2007}                                                                                       & Wavelet based features                                                                                          & LSSVM                 & 64 patients, 64 recordings, 512 cycles                                                                                  & $88.750-100\%\;Ac$ (dependant on abnormality type) \\
Quiceno-Manrique et~al. \citeyearpar{Quiceno-Manrique2010a}& Downsampled to 4KHz, Normalised to maximum of signal, ECG assisted QRS complex detection algorithm used for segmentation                  & Spectral features derived from STFT, Wavelet decomposition and quadratic energy distributions                   & $k$-NN                & 22 patients, 16 normal, 6 abnormal, 8 recordings (12s) per patient                                                      & $98\%\;Ac$                                         \\
Schmidt et~al. \citeyearpar{Schmidt2015}          & Signal filtered into frequency bands, Segmented by HMM based method+hand corrected, removal of high variance sub-segments to remove noise & Parametric spectral features (AR, ARMA and Music), Instantaneous frequency and amplitude, Power in octave bands & QDA                   & 435 Recordings, 133 patients, 70 normal, 63 abnormal                                                                    & $73\%\;Ac$                                         \\
Reed et~al. \citeyearpar{Reed2004}             & ---                                                                                                                                       & Wavelet decomposition coefficients, Manual feature reduction                                                    & ANN                   & 5 patients, 4 cycles per patient                                                                                        & $100\%\;Ac$                                        \\
\dbottomrule\\
\end{tabulary}
$Ac = \text{Accuracy}$
\end{table}
\end{landscape}
\restoregeometry

\subsubsection{Physionet challenge entries}\label{ChallengeEnt}
\doublespacing
The 2016 Physionet/CinC Challenge aimed to encourage development of heart
abnormality detection algorithms by providing a large open database of PCG
signal recordings, sourced from a variety of both clinical and non-clinical
environments. (Further details on the database can be found in
section~\ref{Database}. The complete specification is presented by Liu et al.~\parencite{Liu2016}). In addition, participants were provided with a
state-of-the-art heart sound segmentation algorithm, as proposed by Springer
et al.\ in Section~\ref{Segmentation}. Participants were then tasked with the
creation of a classification algorithm that could robustly discriminate between
healthy and unhealthy heart sound samples. The challenge received 348 entries
in total, each of which was scored on a hidden test database
using a Modified accuracy measure ($Acc$) as defined by Clifford et.
al~\parencite{Clifford2016}:
\begin{table}[htbp]
\centering
\caption{Output Classification}
\label{OutputClassification}
\doublespacing
\begin{tabular}{llccc}
\hline
                              &                 & \multicolumn{3}{c}{Algorithm's Output}                                                    \\ \hline
                              &                 & \multicolumn{1}{l}{Normal} & \multicolumn{1}{l}{Uncertain} & \multicolumn{1}{l}{Abnormal} \\
\multirow{4}{*}{Ground Truth} & Normal, clean   & $Nn_1$                     & $Nq_1$                        & $Na_1$                       \\
                              & Normal, noisy   & $Nn_2$                     & $Nq_2$                        & $Na_2$                       \\
                              & Abnormal, clean & $An_1$                     & $Aq_1$                        & $Aa_1$                       \\
                              & Abnormal, noisy & $An_2$                     & $Aq_2$                        & $Aa_2$                       \\ \hline
\end{tabular}
\end{table}

\doublespacing

Weights are calculated as:
\begin{table}[H]
\centering
\doublespacing
\begin{tabular}{ll}
$Wa_1 = \frac{\text{Clean abnormal recordings}}{\text{Total abnormal recordings}}$ & $Wa_2 = \frac{\text{Noisy abnormal recordings}}{\text{Total abnormal recordings}}$ \\
$Wn_1 = \frac{\text{Clean normal recordings}}{\text{Total normal recordings}}$     & $Wn_2 = \frac{\text{Noisy normal recordings}}{\text{Total normal recordings}}$
\end{tabular}
\end{table}

Modified sensitivity ($Se$), specificity ($Sp$) and overall accuracy ($Acc$) are then calculated as:

\begin{align*}
    &Se=Wa_1\frac{Aa_1}{Aa_1+Aq_1+An_1}+Wa_2\frac{Aa_2+Aq_2}{Aa_2+Aq_2+An_2} \\
    &Sp=Wn_1\frac{Nn_1}{Na_1+Nq_1+Nn_1}+Wn_2\frac{Nn_2+Nq_2}{Na_2+Nq_2+Nn_2} \\
    &Acc=\frac{Se+Sp}{2}
\end{align*}

This section summarises some of the key works presented for the challenge,
including some of the most accurate models, and a baseline classifier
provided to participants as a starting point.\\

This baseline classifier was provided in order to demonstrate the basic
structure of systems expected for entries~\parencite{Liu2016}. The classifier
extracted a selection of 20 basic features, primarily focused on relative
timings and amplitudes of heart sounds.  A binary logistic regression model was
chosen for classification. From the 20 extracted features, 13 were selected
based on their statistical significance, measured using forward likelihood ratio
selection. The system achieved a reported score of 66\% on the test set, giving
a baseline score for participants to build on.  In addition, the system was
trained using leave-one-out cross validation. By removing a single training
database on each fold, the generalisation of the algorithm could then be
evaluated, training on all other databases. Results showed that performance
decreased significantly when training via this method, giving an average
accuracy of 59\%, with training database $b$ scoring as low as 47\%.  This
could suggest that individual databases in the database are not sufficiently
represented by other databases, or that features do not model abnormalities
sufficiently.\\

Homsi et.\ al proposed a system that utilised 131 time domain, STFT-based and
wavelet-based features which, when combined with nested ensemble classifiers,
produced an accuracy score of 84.48\%~\parencite{Homsi2017}. This algorithm
combines many commonly-used features in previous PCG related literature such as
wavelet decomposition based features, MFCCs and Shannon Energy. The system also
uses a total of 40 classifiers, 20 for signals labelled to be `standard' and 20
for those labelled as `atypical'. A mixture of Random Forrest, LogitBoost and
Cost-Sensitive Classifiers (CSC) are used to classify signals in parallel.
Final results are combined using a rule-based decision, designed through manual
experimentation.\\

Potes et al.\ presented a similar approach to that of Homsi et
al.~\parencite{Potes2016}. 124 similar TFR features were extracted and used as
vectors for an AdaBoost classifier. This classifier was then combined with a
deep learning approach using a Convolutional Neural Network (CNN) classifier.
The signal was decomposed into 4 frequency bands and segmented, to provide
input to the CNN. Results from both AdaBoost and CNN classifiers were then
combined using a set decision rule.  This method produced the highest score on
the test set for the challenge at 86.02\%.\\

Zabihi et al.\ took an alternative approach by choosing not to segment PCG data
in the pre-processing stage~\parencite{Zabihi2016}. This was with the intention
of reducing computational complexity of the resulting algorithm. In addition,
the proposed method utilizes a sequential forward feature selection (SFS) and
Linear Predictive Coefficients (LPC) for the reduction of features used for
classification. This benefits the system by removing correlated and irrelevant
features, thus reducing computational complexity and removing irrelevant noise
from feature vectors prior to training.  Final classifications are determined
through cascaded ensembles of ANNs. The signal is first classified as either of
high or low sound quality, and then as normal or abnormal. The system achieved
a final score of 85.9\% on the hidden test set.\\

Plesinger et al.\ opted to develop a new form of machine learning algorithm
based on probability assessment~\parencite{Plesinger2017}. In this method,
features are mapped to histograms and thought of as probability distributions.
Weights are applied based on the number of occurrences of each feature, which in
turn generates a probability function. This can then be used to calculate the
estimated classification of a new data point. From the 228 extracted features,
53 features were then selected based on calculated sensitivity and specificity
scores using generated histograms. This allowed for the training scores to be
automatically optimized by the algorithm.\\

Kay et al.\ present a method using ANNs, a wide variety of features and PCA for
feature reduction~\parencite{Kay2017}. The algorithm scores well on the test
set. However, this work is most notable for it's rigorous evaluation by
authors, using leave-one-out cross validation for a clearer understanding of
the generalisation of the algorithm, as well as highlighting issues with the
underlying database that are discussed in Section~\ref{Database}


\newgeometry{margin=1cm} % modify this if you need even more space
\begin{landscape}
\begin{table}[H]
    \captionof{table}{Summary of top 10 Physionet Challenge 2016 entries}
    \label{PhysionetTable}
\scriptsize
%\centering
\rowcolors{1}{gray!15}{white}
\doublespacing
\begin{tabulary}{\linewidth}{CCCCC}
\dtoprule
Author                                           & Features                                                                        & Classification Method                                    & Reported Scores                                                                                      & Challenge Score \\ \midrule
Potes et.~al \citeyearpar{Potes2016}             & 124 TFR features                                                                & Combined AdaBoost/ANN                                    & In-house test set accuracy: AdaBoost-abstain: 79\%, CNN: 82\%, Combined classifiers: 85\%            & 86.02\%         \\
Zabihi et.~al \citeyearpar{Zabihi2016}           & 40 temporal, spectral and TFR features, reduced using SFS and LPC               & 2 ensembles of neural networks                           & Training accuracy: Maximum of 91.50\%                                                                & 85.90\%         \\
Kay et.~al \citeyearpar{Kay2017}                 & CWT, MFCCs, complexity measures, Inter-beat features, PCA                       & ANNs                                                     & A range of cross validation based tests were used to analyse performance. See paper for full details & 85.20\%         \\
Bobillo \citeyearpar{Bobillo2016}                & MFCCs and WPD, reduced using tensor decomposition                               & $k$-NN                                                   & A range of cross validation based tests were used to analyse performance. See paper for full details & 84.54\%         \\
Homsi et.~al \citeyearpar{Homsi2017}             & 131 time domain, STFT based andwavelet based features                           & Combined ensembles of LogitBoost, Random Forrest and CSC & Training accuracy 87.7\%, In-house test accuracy: 93.24\%                                            & 84.48\%         \\
Maknickas et.~al \citeyearpar{Maknikas2017}      & MFCCs, reduced by Karhunen–Loeve transform                                      & Deep Neural Network                                      & Training accuracy 99.7\%, Validation accuracy 95.2\%                                                 & 84.15\%         \\
Plesinger et.~al \citeyearpar{Plesinger2017}     & Statistical and symmetry properties of amplitude envelopes for S1 and S2 sounds & Custom probability assessment machine learning algorithm  & Training accuracy 90.3\%                                                                             & 84.11\%         \\
Rubin et.~al \citeyearpar{Rubin2016}             & MFCCs                                                                           & Convolutional neural networks                            & --                                                                                                   & 83.99\%         \\
Jiayu (paper not submitted)                      & --                                                                              & --                                                       & --                                                                                                   & 82.82\%         \\
Abdollahpur et.~al \citeyearpar{Abdolahpur2017} & time, TFR and perceptual features, reduced using Fisher's discriminant analysis & Combined ANNs                                            & Training accuracy: 91.6\%, 87\%, 84.55\% (prior to ANN combination method)                           & 82.63\%\\
\dbottomrule\\

\end{tabulary}
\end{table}
\end{landscape}
\restoregeometry
\doublespacing
\section{Database}\label{Database}
%TODO: Briefly describe what is needed from a database for this project
A database representative of real-world PCG signals was needed to train models
and evaluate the proposed method effectively.  A number of criteria were
identified as necessary for the success of the proposed project:
\begin{itemize}
    \item The database must contain sufficient PCG data, so that a model
        trained to discriminate between said signals would, in theory, generalise
        to new PCG data.
    \item The database should contain a mixture of clean and noisy signals that
        represent a variety of real world situations, as real-world
        classification would likely be performed in sub-optimal conditions. If
        this is not possible, noise could potentially be added to clean signals
        to simulate this.
    \item It must be possible to differentiate healthy signals from a variety of
        individual pathologies, in order to provide a general abnormality
        detection algorithm. This should be reflected in the database through
        inclusion of a variety of signals representing different pathological
        heart conditions.
    \item Data must be reliably labelled in order to generate a reliable model
        (particularly when using machine learning methods, as in the proposed
        project). Labels should ideally be verified by a trained professional.
\end{itemize}
\noindent
Two viable options were then considered based on the above criteria:
\begin{enumerate}
    \item The Physionet challenge database
    \item Generation of a synthetic dataset via methods such as that proposed
    by Almasi et al.~\parencite{Almasi2011}
\end{enumerate}

Generation of synthetic data was considered, as few well-formed alternative
databases exist, other than the Physionet challenge data. The database curated
for the Physionet challenge was selected for this project, as it fulfilled the
criteria sufficiently and posed less of a risk in terms of signal quality, due
to all signals being produced in real-world environments. However, synthesis
of PCG data remains an interesting possibility for improving evaluation of
classification systems and could be considered for the generation of additional
samples in future work.

\subsection{Database summary}
The selected database is significantly larger and contains a wider variety of
signal conditions than any database used for previous research (as detailed in
table~\ref{PriorWorkTable}). It is released as an open-source resource and is
documented in significant detail by Liu et al.~\parencite{Liu2016}. The lack of
any alternative databases, comparable in size or variety of content, perhaps
makes this resource the current standard for PCG analysis projects. In
addition, by replicating the conditions of the Physionet challenge, results can
be directly compared with those of the challenge participant's, with the aim of
understanding how the proposed algorithm compares to the current state of PCG
analysis.

\begin{itemize}
    \item The database consists of 6 sub-databases, labelled $a$ to $f$.
    \item These sub-databases have been sourced from a variety of professionals,
        over the course of a decade.
    \item A total of 3,240 recordings are included, created using varying equipment.
    \item 2575 recordings are labelled as normal, 665 are labelled as abnormal.
    \item All samples have been resampled to 2KHz
    \item Samples were recorded in a range of both clinical and
        non-clinical environments.
    \item Many recordings are corrupted with environmental noise, such as
        microphone friction, breathing, talking etc\ldots
    \item Sections of silence are present in some recordings, most
        significantly in database $e$.
\end{itemize}

\subsection{Considerations}\label{DBCons}
There are a number of issues with the acquired database that have been
highlighted, both through previous literature and through development of the
proposed system. These have been considered throughout development and evaluation of
the project.\\
A significant issue highlighted by Liu et al.\ is the large number of normal
recordings compared to pathological recordings. This creates a clear class
imbalance issue that can result in over-inflated classification
results. This is considered in
Section~\ref{Resample}.\\
Another key issue is the difference between the databases used by participants of the
Physionet challenge, and the available data that was acquired for this project.
For unknown reasons, information such as patient labels used for training many
of the challenge participant's models have not been made publicly available and
so could not be used for training of the proposed system.\\
The lack of access to the hidden test set used for evaluating challenge entries
also had a significant impact on evaluation. An alternative method for
evaluating using only the data provided has been proposed in
Section~\ref{metrics}.\\
Finally, an issue is highlighted by Bobillo with regards to database
$e$~\parencite{Bobillo2016}. The recording of normal and pathological signals using
separate devices is likely to cause issues and is discussed in
Section~\ref{Eval}.

%BEGIN NEW MATERIAL
\pagebreak
\section{Design}
This project aims to provide robust heart abnormality detection for PCG
signals, such that use of the system could reliably recommend further medical
attention when necessary. It is clear from previous research that machine
learning methods for classification have shown the most promise in this area,
and that ensemble methods have been largely successful in improving
classification accuracy of base classifiers~\parencite{Homsi2017, Potes2016}.
However, one such method that has recently shown significant success in other
fields, but is not present in recent PCG analysis literature, is the stacking
meta-classifier~\parencite[p.498]{Tobergte2013a}. The presented system was
therefore designed to explore the potential of this classification method in
the context of PCG signal classification. This section details the four key
components developed to form the final system: signal preprocessing
(Section~\ref{preprocessing}), audio feature extraction (Section~\ref{featEx}),
classification (Section~\ref{class}) and optimisation (Section~\ref{optimise}).

% TODO: Create flow diagram Preprocessing -> Feature extraction ->
% Model optimisation -> Performance evaluation

\subsection{Preprocessing}\label{preprocessing}
It quickly became apparent that, due to significant variations in the available
data (as a result of noise, variations in recording equipment etc...), the
effective preprocessing of such data would be a critical factor when designing
the system. This section details the most significant preprocessing steps
taken, in order to both minimize noise and extract the basic structure of the
signal.

\subsubsection{Signal decimation}
A common method employed to simultaneously reduce computation time and remove
extraneous information is to decimate the input signal by an integer factor.
According to Shannon sampling theorem, a digital signal can only represent
frequency content up to half the sample rate of the signal (the nyquist
rate)~\parencite[p.140]{Kadis1999}
Therefore, by removing every $n$th sample, high frequency content can be
removed whilst lowering the number of samples that must be processed in
subsequent operations. An anti-aliasing filter must also be applied to the
signal in order to filter harmonic distortion generated by the process.
As it is commonly stated in the literature that little relevant information in
PCG signal is found above 400Hz, all signals were resampled to 1KHz giving a
500Hz cutoff frequency, using an 8th order zero-phase Chebyshev type I filter.

\subsubsection{Dataset resampling}\label{Resample}
A common issue with data collected from the real world (ie. non-synthetic data)
is the imbalance of classes in data. As noted by Liu et
al.~\parencite{Liu2016}, this is the case with the available dataset, as there
are less pathological signals than healthy signals.  This presents an issue
with classification tasks, as imbalance can have a negative impact on
classification of the minor class. In this context, class imbalance could
potentially impact classification accuracy for abnormal samples, so must be
handled appropriately. This issue can be approached using a number of methods.
Sophisticated oversampling methods such as SMOTE (Synthetic Minority
Oversampling Technique) offer one solution. SMOTE generates synthetic samples
using interpolation and adds these to the data set to balance the classes,
without using direct copies of existing data. However, oversampling techniques
such as this can increase overfitting of models, and don't always offer
reasonable improvement in performance~\parencite{Longadge2013}.  Undersampling
is the most common method used, typically by randomly removing samples from the
major class. This has the obvious disadvantage of reducing data available for
training. However, an improved method using $k$-Means clustering has been shown
to be effective in previous cardiovascular classifications
problems~\parencite{Rahman2013}. This method was seen to be the best choice for
the proposed system. This method is illustrated using a small generated
2-dimesional dataset in Figure~\ref{cent}.

\begin{figure}[H]
    \caption[caption of centroid]{Example resampling of synthesised dataset using cluster centroids\footnotemark}
    \makebox[\textwidth]{\includegraphics[width=\textwidth]{centroid}}
    \label{cent}
\end{figure}
\footnotetext{This figure was adapted from: \url{http://contrib.scikit-learn.org/imbalanced-learn/stable/}}

\subsubsection{Signal segmentation}
With one notable exception~\parencite{Langley2016}, previous classification
algorithms rely heavily on the ability to segment signals into the four
fundamental heart sounds. This is a key prerequisite to the extraction of
relevant features. The defining of signal structure allows for the
relationships between it's components to be analysed, as described in
Section~\ref{featEx}. To facilitate the development of robust algorithms for
the Physionet challenge, participants were provided with an implementation of
Springer's HSMM based segmentation algorithm. As the highest scoring algorithm
in the literature, it was clearly the most suitable algorithm to use for the
proposed system. In addition to the high accuracy of segmentation, the wide
adoption of this algorithm is beneficial for comparison with other algorithms
submitted to the challenge. Results produced by the proposed system will
generally not be coloured by the differences in quality of segmentation
algorithms, allowing for more direct comparison of classification methods.
However, it is noted that despite the high performance of the algorithm, errors
in segmentation will still occur, that may have a negative impact on feature
quality. As methods proposed by previous literature, such as hand correction by
a professional~\parencite[p.2203]{Liu2016} are not feasible in this context,
and considering the low number of erroneous results produced by the
algorithm~\parencite[p.2]{Goda2016} it was decided that these errors would not
pose a significant problem. An illustration of PCG data segmentation can be
seen in Figure~\ref{segs}.

\begin{figure}[H]
    \caption{Example segmentation of PCG data}
    \makebox[\textwidth]{\includegraphics[width=\textwidth]{segs}}
    \label{segs}
\end{figure}


\subsection{Feature extraction}\label{featEx}
The extraction of feature vectors from data is a fundamental component of most
machine learning based systems. The aim is to construct meaningful
representations of the data that emphasize information relevant to the
classification problem. In the proposed project, 188 features were extracted
from the data, procuring feature extraction techniques from a wide range of
previous literature, as well as using novel perceptual features commonly found
in audio/music analysis (See Sections~\ref{FFT} and~\ref{Time}).
There are also potential issues that can occur when using large sets of
features for training. The method proposed for addressing these issues is
discussed in section~\ref{SFS}. This section provides a summary of the main
feature categories. Please refer to appendix~\ref{appendixA} for a full
breakdown of all features.

\subsubsection{Time-domain features}\label{Time}
A range of features were generated, based directly on the time series data.
Features such as:
\begin{itemize}
    \item Average and standard-deviation of segment intervals, for all heart
        sounds and complete heart cycles.
    \item Ratio of systolic and diastolic period to total heart cycle period.
    \item A range of statistical features such as entropy, skewness and variance for
        each heart sound.
    \item A selection of envelope based features for each heart sound.
\end{itemize}

18 features provided by the Physionet challenge focused on timings between
segments of the heart cycles. It was thought that these features would be
useful in capturing irregularities caused by conditions such as arrhythmias,
atrial septal defect and other conditions that are likely to affect relative
timing of heart sounds.
Many conditions that can be detected by traditional auscultation are
characterised by an increase in loudness of the S1 and/or S2 heart
sounds~\parencite{Brown2008}. This suggests that features relating to human
perception of loudness may aid in the detection of such conditions.  Simple
envelope based features such as RMS, peak loudness and the Shannon energy
envelope (Equation~\ref{ShanEQ}) that proved popular in previous literature,
were extracted for this reason~\parencite[p.73-77]{Lerch2012}. In addition,
statistical features such as sample entropy and skewness (Equation
~\ref{SkewEQ}) were used to evaluate the distribution of samples for each heart
sound, these were selected to provide a representation of the temporal
``shape'' of each sound.

\begin{equation}\label{ShanEQ}
    SE = \frac{-1}{N}\sum\limits_{n=0}^N x(n)^2\cdot \log{x(n)^2}
\end{equation}
\begin{equation}\label{SkewEQ}
    S=\frac{E(x-\mu)^3}{\sigma^3}
\end{equation}
Where:\\
$x(n)$ is the input signal\\
$E(t)$ is the expected value\\
$\mu$ is the mean of the signal\\
$\sigma^2$ is the variance of the signal

\subsubsection{FFT-based features}\label{FFT}
It was recognised that a time domain representation alone was unlikely to
provide a sufficient representation for discerning a wide variety of
conditions. Using a time-frequency representation to characterise the spectral
components of the signal has proven effective in the majority of literature.
The classic method for producing a spectral representation of a signal is to
apply the Discrete Fourier Transform (DFT) (as defined in Equation~\ref{FFTEQ})
over a sliding window of size $N$. By decomposing the signal into a series of
sine and cosine waves, a representation of the signal's spectral content across
a range of frequency bands is produced. This can be used for further analysis
of heart sounds based on their spectral characteristics.
\begin{equation}\label{FFTEQ}
X(k)=\sum\limits_{n=0}^{N}x(n)e^{\frac{-j2\pi kn}{N}}
\end{equation}
Where $x(n)$ is the input signal\\
Features generated using this representation would, in theory, be useful for
identifying conditions that reside in specific frequency bands, such as
murmurs, for example~\parencite{Sepehri2010}.\\

An example of such features are Mel-Frequency Cepstrum Coefficients (MFCCs).
Popular in speech processing, MFCCs provide a compact representation of a
signal's spectral shape. MFCCs are calculated by first applying $N$ (a
user-defined parameter) triangular filter banks, spaced using the mel scale to
the magnitude spectrum. Applying a discrete cosine transform to the log of the
filterbank outputs provides the final set of coefficients (for further details,
please refer to~\parencite{Lerch2012}). This analysis
creates a perceptually relevant representation of spectral shape, in effect
mimicking the way in which humans might perceive the spectral shape of heart
sounds. The reasoning for this is that, as the aim is to provide a system with
performance better than, or equal to to that of a human, features that mimic
what a human perceives may prove effective at distinguishing conditions in the
way that a human does. This has shown to be effective in previous literature,
with multiple systems utilising perceptual features with
success~\parencite{Ortiz2016, Rubin2016, Quiceno-Manrique2010a}. 13 MFCCs were
calculated for each heart sound and averaged to provide 13 features
per sample.\\
%TODO: Generate MFCC spectum

In addition to MFCCs, other statistical features were extracted from the
spectrum such as spread, skewness, kurtosis and flatness. These features aim to
provide alternate spectral measurements to MFCCs, in a similar way to their
temporal counterparts as described in Section~\ref{Time}.

Although the Fourier representation of PCG signals has proven effective in many
cases, there are drawbacks of this representation that must be considered. One
key issue that is inherent of Fourier transforms is the time-frequency
trade-off. An increase in frequency resolution will always result in a decrease
in temporal resolution. This poses a problem, as it is not possible to localize
transient events accurately in the frequency domain using this method. This
method may also suffer in the presence of background noise common in PCG
signals. Previous studies have shown that these factors may have a significant
impact when detecting conditions such as Coronary
Stenoses~\parencite{Ergen2001, Akay1990}

\subsubsection{Wavelet decomposition features}
The wavelet transform has been used effectively as an alternative
time-frequency representation to Fourier methods. The fundamental concept of
the wavelet transform is to represent an input signal as a set of scaled and
shifted finite oscillations. By comparing the signal with each scale of wavelet
at all points in time, a set of $N\times A$ (Where $A$ is the number of scales)
coefficients are generated. These define the scale and position needed for
each wavelet in order to fully reconstruct the signal (This is illustrated in Figure~\ref{wave}. For further details,
refer to~\parencite{Polikar1994}). The benefit of this transform is that it is
well localized in both time and frequency domains. This allows for accurate
representation of transient events such as clicks and snaps that are
characteristic of heart conditions such as Mitral valve prolapse or
stenosis~\parencite{Brown2008}.\\
For the proposed system, a 5 level DWT using daubechies wavelets-4 mother wavelet was
used for decomposition and reconstruction. Statistical features such as entropy
were then calculated, both on the reconstructed signal and directly on
coefficients to attain a total of 48 features.~\parencite{Homsi2016}
% TODO: Insert wavelet diagram here

\begin{figure}[H]
    \caption{Example 5 level daubechies 4 wavelet decomposition and reconstruction (normalised). Plots in descending order: D1, D2, \ldots, D5, A1}
    \makebox[\textwidth]{\includegraphics[width=1.0\textwidth]{wavelet}}
    \label{wave}
\end{figure}

\subsubsection{Feature scaling and imputing}
A common problem when working with multiple features is the difference in scale
between features. This problem can cause many machine learning algorithms to place
bias on larger scale features and can significantly impact the time taken for
certain algorithms to converge. This is particularly significant when applying
algorithms sensitive to feature scale such as SVMs (described in
Section~\ref{SVM}). To address this, a Min-Max scaler was applied
to training and test sets prior to training models. This scales all values to within a
0--1 range producing a set of features on a common scale.\\
It is also common to encounter missing values in features. These can occur as a
result of $\log(0)$ or division by 0 calculations, amongst other edge cases. A
standard method for handling these values is to apply an imputer, replacing
values with the mean of the feature vector~\parencite{VanderPlas2017}.

\subsection{Stacking classifier with cross-validation}\label{class}
The stacking classifier is an ensemble classifier, that uses the results of
multiple base classifiers as input to a 2nd level meta-classifier, which in
turn is used to generate a final prediction. $k$-fold cross validation is used
across base classifiers, training on $k-1$ folds of input data, and applying
to the remaining validation set. The results of these predictions from each
base classifier are combined and used to train the 2nd level classifier which
produces the final predictions based on the probabilities and predictions
provided. This is illustrated in figure~\ref{stack}\\
Given it's proven accurate performance across a range of tasks, it was
expected that this classification model could be applied effectively to produce
an alternative method for abnormality detection than those presented in
previous literature.
% TODO:Insert stacking classifier diagram

\begin{figure}[H]
    \caption[caption of stack]{Stacking classifier overview\footnotemark}
    \makebox[\textwidth]{\includegraphics[width=0.5\textwidth]{stacking_cv_classification_overview}}
    \label{stack}
\end{figure}
\footnotetext{Figure retrieved from:\url{http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/}}

\subsubsection{Base classifiers}
Clearly, an important consideration when using any ensemble method is the
selection of the base classifiers. In order for any ensemble method to perform
well, it must be constructed using a selection of classifiers that individually
provide useful models for the data~\parencite[p.484]{Tobergte2013a}.   A wide
variety of models were considered for use as base and meta models including
models such as Tree based, $k$-Nearest Neighbour, and AdaBoost classifiers.
Selection of these models was based on a novel approach using hyperparameter
optimisation as discussed in Section~\ref{optimise}. The following sections
detail the 3 final models selected by the optimisation algorithm; A combination
of SVM and Naive-Bayes classifiers, with a Logistic Regression meta classifier.

\paragraph{SVM}\label{SVM}
The SVM classifier aims to fit a hyperplane to data that maximises the
separability between classes. This results in a model that has been shown to
generalise well in many cases, as maximising separability between classes is
also likely to increase the margin for error in separation of classes. This
type of classifier is also able to generate hyperplanes in non-linear space,
using a techniques known as `kernel tricks'. This works by mapping linear data
to a higher dimension, allowing non-linearly separable classes to be separated
by the same method. The details of the SVM and Kernel-SVM are complex and
outside the scope of this report. Further information can be found
in~\parencite[p.187]{Tobergte2013a}.\\
SVMs have been prevalent in previous literature, shown to be effective in
separation of a variety of heart conditions~\parencite{Ari2010} The use of
kernels to map parameters to higher dimensions is a key advantage of this
model, allowing for non-linear relationships that are likely to be present in
the large variety of features to be well represented in classification. Choice
of kernels, and relevant hyperparameters is detailed in Section~\ref{PSOp}.

\paragraph{Naive-Bayes}
Commonly used in text classification problems, where there is typically a
high-dimensional feature space, Naive Bayes classification uses Bayes rule to
determine the probability of classification, given a vector of features. This
is calculated as:
\begin{equation}
    P(y\mid x_1,\ldots,x_n)=\frac{P(y)\prod\limits_{i=1}^{N}P(x_i\mid y)}{P(x_1,\ldots,x_n)}
\end{equation}
The implementation used assumes a Gaussian distribution for all features,
calculating the probability of a feature as:
\begin{equation}
    P(x_i\mid y)=\frac{1}{\sqrt{2\pi
    \sigma_y^2}}\exp\bigg(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\bigg)
\end{equation}
Where:\\
$\mu$ is the mean of the distribution\\
$\sigma^2$ is the variance\\
Using Maximum Likelihood estimation to estimate $\sigma$ and $\mu$ given the
feature vector, a classification for new features can then be calculated as:
\begin{equation}
    \hat{y}=\argmax\limits_y P(y)\prod\limits_{i=1}^nP(x_i\mid y)
\end{equation}
Where:\\
$x$ is the feature vector to be classified\\
$\hat{y}$ is the estimated classification\\

Despite their computational simplicity, Naive Bayes classifiers have been shown
to produce highly accurate classifications models. The assumption that each feature is
completely independent allows for extremely fast classification and scalability
to large datasets, with many dimensions~\parencite[p.300]{Zhang2004}. It was
thought that these benefits would make the classifier suitable for the proposed system, as the relatively high
dimensionality of features and quantity of data points could then be classified
quickly, to obtain initial results. Despite the inclusion of more complex
models, this model was chosen via automatic selection for the final model.
Refer to section~\ref{PSOp} for further details.

\paragraph{Logistic regression}
Logistic regression is a regression model that aims to fit as hyperplane to
data points by minimizing a cost function using weighted features.
By applying weights to feature vectors then applying a sigmoid function, a
hypothesis function is defined as:
\begin{equation}
    h_\theta(x)=\frac{1}{1-e^{-\theta^{T}x}}
\end{equation}
Where:\\
$x$ is a feature vector\\
$y$ is a class label vector \\
$\theta$ is a weight vector \\
A cost function can then be defined as:
\begin{align}
    &J(\theta)=\argmin\limits_\theta\frac{1}{2m}\sum\limits_{i=1}^m\Big(h_\theta(x^{(i)})-y^{(i)}\Big)^2+\text{Regularization}(\theta)\\
    &\text{Regularization}{(\theta)}_\text{L1}=\lambda\sum\limits_{j=1}^n\mid\theta_i\mid\\
    &\text{Regularization}{(\theta)}_\text{L2}=\lambda\sum\limits_{j=1}^n\theta_i^2
\end{align}
Where:\\
$\lambda$ is the regularization parameter used to help prevent overfitting\\
By minimizing the cost function, classification predictions can then be made
using the hypothesis function~\parencite{Ng2012}.\\
Logistic regression was chosen as the meta-classifier primarily due to it's
simplicity and performance in testing. It is thought that this algorithm
performed particularly well as output from base classifiers was linearly
separable and relatively simple (in comparison to input features). The choice of
meta-classifier is a potential area for improvement and it is noted that a
range of meta-classifiers have been proposed for different tasks that utilise
stacking~\parencite[p.29]{Sesmero2015}. Further work in this area could
potentially provide improved results.

\subsection{Model optimisation}\label{optimise}
As discussed in previous sections, two of the most important aspects that affect
the performance of a classification system are it's models, and the input
features. A combination of relevant features and well tuned models is therefore
likely to provide an accurate classification system. However, it is not always
immediately clear which values to choose for parameters, or which features to use as
inputs. This is especially true when given such a wide selection of models to
choose from, and such high dimensional feature spaces, as are used in the
proposed system. To address this issue, two automatic optimisation approaches
were implemented, with the aim of maximising the accuracy of the proposed
system.

\subsubsection{Sequential feature selection}\label{SFS}
It was recognised that the extraction of such large numbers of features in the
proposed system would likely result in a large amount of redundant information.
There are two commonly used methods for addressing this problem: feature
reduction and feature selection. Feature reduction involves reducing features
to a lower dimensionality using techniques such as PCA. Conversely, feature
selection involves selectively removing features entirely via methods such as
Sequential Floating Forward Selection (SFFS). Both aim to reduce the amount of
redundant information in features by removing or reducing features that are not
expected to benefit the model. As a selection of models were to be used, each
potentially handling dimensionality differently (SVMs in particular), it was
decided that feature selection would be most appropriate for this
application.\\

Through experimentation, the chosen method was SFFS. This method is an adaptation
of tradition sequential forward selection, that also uses sequential backward
selection to allow for subsequent removal of added features when necessary.
SFFS is an iterative wrapper method that adds features and re-trains the chosen
model sequentially, choosing features that increase the accuracy of the model
output (using 3-fold cross validation to avoid overfitting). Final models used
as few as 40 features, increasing both accuracy of classifications and
computation time of models significantly.\\
It should be understood that this method does not guarantee a globally optimal
set of features. An exhaustive feature selection algorithm is capable of this
but this would incur significant computational cost. For further details on
SFFS please refer to~\parencite[p.3]{Ferri1994}

\subsubsection{Particle swarm optimisation}\label{PSOp}
The particle swarm optimisation algorithm is an iterative meta-heuristic algorithm that
aims to find the set of parameters that maximises a given function. Given a
$n$ dimensional parameter space, the algorithm randomly initialises sets of
`particles' representing random combinations of parameters. As the algorithm
progresses particles travel through the parameter space, updating their
position based on their velocity, best historical score and the best historical
score of the swarm. As the algorithm iterates, particles will converge on local
optima, producing potential solutions. The best score is chosen after the final
iteration as the best parameter selection. Annotated pseudocode for this
algorithm is shown in code block~\ref{PSCode}~\parencite{Clerc2002}

\onehalfspacing
\begin{lstlisting}[escapeinside={(*}{*)}, label={PSCode}, caption={Particle
Swarm Optimisation Pseudocode}]
Do
    //For all particles...
    For (*$i$*)=1 to Population Size
        // If the current function score is better than the historical
        // function score for the current particle, store new position
        if (*$f(\overrightarrow{x}_i)>f(\overrightarrow{p}_i)\text{ then }\overrightarrow{p}_i = \overrightarrow{x}_i$*)
            // Store best position of all particles in neighbourhood
            (*$\overrightarrow{p}_g=\text{max}(\overrightarrow{p}_{\text{neighbors}})$*)
                // For each dimension in the parameter space...
                For (*$d=1$*) to Dimension
                    // Update velocities
                    (*$v_{id}=v_{id}+\phi_1(p_{id}-x_{id}+\phi_2(p_{gd}-x_{id})$*)
                    // Ensure particle velocity is within limit
                    (*$v_{id}=\text{sign}(v_{id})\cdot \text{min}(\text{abs}(v)_{id}, v_{\text{max}}))$*)
                    // Update particle position
                    (*$x_{id}=x_{id}+v_{id}$*)
                Next (*$d$*)
    Next (*$i$*)
Until termination criterion is met
\end{lstlisting}
\doublespacing

The use of this algorithm allowed for the efficient optimisation of all
parameters relating to the stacking classifier and it's base classifiers,
resulting in a finely tuned classification model that would not have been
producible using traditional trial and error methods to search for optimal
parameters.\\
During the initial design phase, it was found that the abundance
of machine learning algorithms available make selection of the optimal model
difficult, requiring in depth knowledge of a range of machine learning
techniques. A novel approach used by recent stacking classifier applications
has been in the use of meta-heuristic algorithm to select models automatically,
in addition to tuning parameters. By thinking of the base classifiers as
hyperparameters themselves, models can be swapped in and tuned automatically by
the particle swarm algorithm to provide a locally optimal selection of base
classifiers for the model~\parencite{Sesmero2015}. This technique was used to
pick the 3 final models described in section~\ref{class} from a selection of 6
models\footnote{The available models for selection were: Random Forrest,
$k$-Nearest Neighbour, Logistic Regression, Naive-Bayes, SVM (kernel based and
linear models) and AdaBoost classifiers, implemented using Scikit-learn}. This
dynamic selection of models was seen to be one of the key contributors to the
overall success of the algorithm. As with the chosen feature selection
algorithm, this method of optimisation is not guaranteed to provide a globally
optimal solution. It was thought that for the proposed system a locally optimal
system would suffice, particularly given the highly complex parameter space
used in implementation. This is discussed in detail in Section~\ref{ModOp}.

\subsection{Model performance evaluation method}\label{metrics}
In order to fully understand the performance of the system (and to evaluate the
impact of design decisions throughout development), a group of scoring methods
were implemented to test the system's performance in a selection of scenarios.
The aim was to provide reliable metrics that would highlight the system's
strength and weaknesses and to provide quantifiable measures with which to
compare the system to the range of alternative methods proposed in the
literature.\\

One of the most basic metrics was the scoring of the trained model on a
separate hold-out dataset. By reserving a selection of samples from across the
databases, a trained model could then be scored on this dataset for accuracy, sensitivity and
specificity (metrics described in Section~\ref{ChallengeEnt}) to determine the
system's performance on an unseen set of samples. This method is widely used to
provide a basic understanding of a model's ability to generalise to new data, a
crucial requirement of the system. Data was split using a grouped stratified shuffle
split, grouping by database. This ensured an equal number of randomly selected
classes were taken from each database to produce training and test sets. This
approach was taken to avoid class imbalance issues caused when there is a
significant difference in class frequency, as detailed in
Section~\ref{Resample}. It should be noted that although samples were
stratified by class, it was not possible to stratify samples by patient. This
may have an impact on results, as the presence of data from the same patient in
both training and test set may artificially inflate results, where the model has
learnt patterns specific to that patient that do not generalise to others.\\

A more robust method for for model evaluation is cross-validation. By splitting
the full dataset into multiple folds, and training models on each, metrics can
be calculated on each fold, and an average can be taken to provide a measure of
the system's performance over all folds. 10-fold cross validation, stratified
by class, was chosen for evaluation of the system. This provides an insight
into the performance of the algorithm across the dataset. This is a common
method used by all participants of the Physionet challenge and is commonly found
in prior literature.\\
It is highlighted by Homsi et.\ al, that a large amount of variance may be
observed across folds~\parencite[p.1637]{Homsi2017}. This is attributed to the
variations across databases, making generalisation difficult. To account for
this, it is suggested that cross-validation is repeated multiple times and
averaged to provide a more accurate measurement of performance across folds.
For the proposed system, cross validation was repeated 10 times for each fold
and averaged to produce the final results. Standard-deviation is also
calculated across these iterations to illustrate the possible prevalence of
this.\\

A common theme throughout the literature was that of generalisation across
databases. It was observed that many previous algorithms achieved high
accuracies in standard cross-validation, but performed significantly worse when
testing on unseen databases~\parencite{Homsi2017, Bobillo2016}. For this
reason, leave-one-out cross-validation was used to form a better understanding
of the system's ability to generalise to unseen data from different sources. On
each fold, a single database is removed, training on all other databases. This
is a useful method as it can be used to determine the level to which
information extracted from databases is representative of other databases.\\

The evaluation of models using cross-validation was not limited to final
evaluation. Evaluation of intermediate models generated by both the SFFS and Particle Swarm
algorithms was possible, by further separating the test set into test and
validation fold. This technique was used in the optimisation of the
final model to provide scores for intermediate models, as well as to help avoid
overfitting parameters and features to the training set used for
optimisation.\\

Discussion on the performance of the proposed system using these methods can be
found in Section~\ref{Eval}.
% TODO: Insert cross validation diagram from data science handbook

\section{Implementation}
This section describes the tools used in the realisation of the
proposed system, the practical issues encountered throughout the
implementation process and the development strategy taken to address and avoid
such issues. Rationale is given for decisions made throughout
production of the proposed system and any known issues with the current implementation are
outlined.

\subsection{Development strategy}
Early in the design process it became apparent that in order for this project
to produce reasonable results, it would need to utilise a number of complex
algorithms to handle the various non-trivial problems that were encountered
throughout development. Python was chosen as the most suitable language for the
implementation of the system. High level language features such as dynamic
types and automatic garbage collection, combined with the large variety of
readily available packages and libraries, make the language a good choice for
the fast, flexible development approach taken throughout this project.\\

The most significant objective from the outset of the project was to provide a
system that could classify pathological signals with a degree of accuracy that
was comparable to the current state of research in the field of PCG analysis.
Given this focus and that the performance of the final product was initially
unknown, it was recognised that the design of the project would need to adapt
as the project progressed, implementing and testing high level concepts to
iteratively improve performance of the system. For this reason a high level
view of production was taken, choosing to focus on the overall system
architecture, rather than spending great amounts of time on on any one specific
element of the project (as any component of the project could be
removed/replaced entirely, if this facilitated the improvement of results).\\

With this design ethos in mind, it was decided that external packages and
libraries would be used/adapted wherever necessary to avoid spending large
amount of time developing proprietary implementations of proven concepts. By
not `reinventing the wheel', it would be possible to rapidly prototype and
evaluate high level concepts, such as the variety of machine learning and
optimisation algorithms detailed in previous sections, quickly and effectively.
Due to it's active developer community, a wide range of scientific computing
and machine learning algorithms are available, such as
NumPy~\parencite{VanDerWalt2011}, SciPy~\parencite{Millman2011} and
Scikit-Learn~\parencite{Pedregosa2011}, each of which was used extensively
throughout the project alongside other packages detailed in the following
sections.

\subsection{System overview}
The proposed system can be broken down into 5 key components: the user
interface, feature generation module, classification module, optimisation
module and evaluation module. The overall architecture of the system follows a
common design pattern for machine learning based systems; Taking a set of input
data, augmenting to produce associated data, extracting patterns from said
data and evaluating to understand the effects of various implementation
choices. The reason for this maintain focus on the central innovations of the
project which are thought to be in the implementation of models and the
analysis of input data.

\subsubsection{User interface and project framework}
As a project focused on producing a functional proof of concept for a specific
problem, this project was not aimed at producing a user-facing application. For
this reason, interactions with the system are based primarily around a simple,
well documented commandline interface (CLI) that allows a user to specify
relevant parameters through use of CLI flags (CLI output flags are detailed in
Appendix~\ref{appendixB}). This gives the user relevant
control over optional parameters that are situation specific (such as
multiprocessing, specifying locations to save data etc\ldots) through an interface that is both easy to maintain throughout
development and intuitive for a user familiar with command line applications.
Relevant information is then presented to the user as the program progresses
through the use of standard output and file based logging systems. Providing
informative feedback to the user was essential, both during development for
debugging processes, and to provide an understanding of the programs progress,
particularly in long-running iterative processes used for optimisation.\\
A file based logging system was developed using Python's built-in logging
module to allow for the monitoring of threaded processes. This allowed for
detailed monitoring of the systems progress, even when running multiple
operations concurrently.\\

A significant issue that developed as the project grew in size and complexity
was the running time. As more complex methods were implemented for feature
extraction and model optimisation, the time taken to process the relatively
large dataset grew considerably. Primarily using Python's object pickling
functionality, Methods were implemented to handle the creation and maintenance
of intermediate feature and parameter files, allowing the program to load
previously generated data. This helped in cutting  computation times of
subsequent runs significantly. This also provided a convenient facility for
storing models, in a portable fashion for transfer between computers, for
example.

\subsubsection{Features extraction}
As a fundamental component of the classification system, the feature extraction
method required careful planning and development to realise the extraction of
the large number of features. As a feature of particular importance, the
integration of the MATLAB segmentation algorithm was a key part of the feature
extraction method. Given a pre-made MATLAB implementation, the most problematic
step was the integration of this code seamlessly with the largely incompatible
Python framework. This was achieved through use of Python's Subprocess module,
allowing for the code to be called through an external unix process. The nature
of this implementation limits the compatibility of the system to unix only
systems. However, given the nature of the project, this is not considered an
issue.\\

Having extracted segmentation data, methods for the efficient calculation and
storage of other features were considered. It was recognised that in order to
maintain such a large number of features, an organised and efficient data
format would be needed. The popular Pandas DataFrame library was chosen for
this task. Built on the NumPy array object, DataFrames provide a labelled data
structure, ideal for storing large quantities of labelled data. As the DataFrame
is based on the Numpy array it is able to combine the powerful matrix
operations of Numpy with intuitive database queries commonly seen in languages
such as SQL. This data structure also allows for simplistic storing and loading
of DataFrames through built-in interfaces to the HDF5 file system, simplifying
the storage of large quantities of intermediate data.\\
A minor issue that was found during development was the lack of compatibility
between DataFrames and Scikit-Learn (described in Section~\ref{Sklearn}). This
simply required careful handling of operations between these APIs in order to
avoid unintentional mishandling of data.\\

As mentioned previously, the calculation of features often required many
mathematical operations to be performed on vectors. The use of Numpy array-like
containers simplified this operation, allowing many features to be calculated
in a single line of code. Standard operations were sufficient for a number of
the features. However, mathematically complex features such as MFCCs and
wavelet transforms required dedicated processing libraries. The open source
pyWavelets and Librosa project were used for initial generation of MFCCs and
the DWT respectively~\parencite{pyWave, Mcfee2015}. Further processing such as
segmentation, entropy analysis etc\ldots is calculated to produce the final
features. For a details of all features please refer to
Appendix~\ref{appendixA}.\\

Given the large number of operation required for feature extraction, a large
amount of time needed to compute features was an unavoidable consequence of
the design. To help alleviate this issue, processing of features was
parallelised, using each sample as an individual job. The speed-up aquired
through parellisation is inherently dependant on the system running the
program, however, this significantly reduced the computation time of features.
A modified implementation of Python's multiprocessing module was used for task
management.

\subsubsection{Classification model generation}\label{Sklearn}
The Scikit-learn machine learning package is a popular choice for
implementation of machine learning algorithms in
Python.~\parencite{Pedregosa2011} As a result, it is well maintained and
benefits from a number of smaller projects that are designed to be compatible
with it's API. Such projects include Mlxtend, designed to expand Scikit-learns
range of features with more esoteric models and tools~\parencite{Raschka2016,}
and Imbalance-learn, which addresses the lack of sample balancing
transformations in Scikit-learn~\parencite{Lemaitre2017}. The project's aim to
provide a standard interface to each of it's algorithms also aids in the quick
prototyping of many machine learning algorithms on the features generated.
During the initial stages of development, this allowed for models to be tested
quickly to gain a rudimentary understanding of the performance of models. From
this, it was possible to explore more complex models and ultimately resulted in
the use of Mlxtend's Stacking ensemble classifier.\\

As the size of the classification codebase grew, so did the number of chained
operations in the classification process. Scikit-learn's Pipeline utility was
used to maintain correct handling of these process chains, allowing for models
to be easily formed of multiple transforms and classifiers in a manageable
manner. This gained particular significance with the implementation of model
optimisation methods, as discussed in the following section.

\paragraph{Model selection/optimisation}\label{ModOp}
The decision to use an ensemble classifier also presented a complication,
through the need to choose multiple complimentary base classifiers. A further
issue was in the growing number of parameters that would need to be tuned to
obtain optimal results from each of the chosen base classifiers. The adaption
of a particle swarm optimisation algorithm was found to be an effective
solution to both of these problems~\parencite{Claesen2014}. Using model
pipelines to encapsulate all transformations and classifiers into a single
object, it was possible to construct a dictionary of multiple potential
Scikit-learn models for each base classifier.  Implementation of a wrapper
function around the full stacking classifier then allowed classifier pipelines
to be treated as hyperparameters, which could be optimized on a training set
alongside all active parameters for each base classifier. This resulted in the
simultaneous selection and optimization of model combinations and
hyperparameters.\\

Mlxtend's implementation of SFFS was also implemented to apply feature reduction to
the dataset. This was initially implemented as part of the processing pipeline,
resulting in feature reduction being applied on every iteration of parameter
optimisation. This initially provided a score for each model based on the
selection of it's optimal parameters. However, as models grew in complexity,
the growing computational complexity resulted in this method becoming infeasible.
This was addressed by re-implementing the feature selection algorithm after
optimisation on the full dataset. The reduction in performance from this is not
thought to be significant.\\

As with parameters, models were saved using a combination of Python's pickling
functionality and Panda's HDF5 export methods, to create fully portable models.

\subsubsection{Automatic system evaluation}
In order to accurately place the system in the context of current research,
evaluation metrics were needed to perform automatic testing of the system.
Metrics were implemented as described in Section~\ref{metrics} using a custom
multi-scorer object that was adapted to allow for the calculation of the 3
metrics: sensitivity, specificity and score. Using this object in conjunction
with a selection of Scikit-learns cross-validation objects provided a mechanism
for quickly evaluating models in an equivalent fashion to those presented in
the literature. In addition, a simple test train split was implemented allowing
for the use of a hold-out dataset. Performance on this dataset was evaluated
directly using the custom scoring functions and Scikit-learn's model scoring
methods.\\
Finally, results were formatted into tables and logged to provide instant
feedback to the user on the performance of the current model.

\section{Results}\label{Eval}
The system was evaluated using 3 primary scoring methods (as described in
Section~\ref{metrics}).
\begin{itemize}
    \item Score on hidden test set
    \item Leave-one-out database cross-validation
    \item 10-fold stratified cross-validation
\end{itemize}

The final optimised model was generated using a total of 43 selected features
out of a possible 188. Parameter optimisation was run with 1000 parameter
evaluations, resulting in 50 iterations using 20 particles. Final parameters
and selected features
for the chosen algorithms are detailed in table~\ref{OpParam}.\\
The final scores produced for this model, evaluated using the full dataset, can
be found in Table~\ref{TestSet}. Scores for Leave-one-out cross-validation and
10-fold cross-validation can be seen in Figure~\ref{fig1} and Figure~\ref{fig2}
respectively. Details can be found in Appendix~\ref{appendixD}.

\begin{table}[H]
\centering
\caption{Hidden test-set scoring}
\label{TestSet}
\begin{tabular}{@{}lll@{}}
\toprule
$Acc$  & $Se$    & $Sp$    \\ \midrule
80.31\% & 83.15\% & 77.47\% \\ \bottomrule
\end{tabular}
\end{table}

% Make lists without bullets and compact spacing
\renewenvironment{itemize}{
  \begin{list}{}{
    \setlength{\leftmargin}{1.5em}
    \setlength{\itemsep}{0.25em}
    \setlength{\parskip}{0pt}
    \setlength{\parsep}{0.25em}
  }
}{
  \end{list}
}
\setlist[enumerate]{itemsep=0.25em}

\begin{table}[H]
\centering
\caption{Optimised model parameters and selected features}
\scriptsize
\label{OpParam}
\begin{tabulary}{\linewidth}{LLLL}
\toprule
Base Classifier A & Base Classifier B     & Base Classifier C  & Meta Classifier            \\ \midrule
Model: Linear SVM & Model: RBF Kernel SVM & Model: Naive-Bayes & Model: Logistic Regression \\
C: 4.2507         & C: 4.9452             &                    & C: 14.3611                 \\
                  & Gamma: 0.4558         &                    & Penalty: L2                \\ \bottomrule
\end{tabulary}
\singlespacing
\begin{multicols}{5}
    \scriptsize
\begin{itemize}
    \item AvrA5diaShan
    \item AvrA5s1Shan
    \item AvrA5s2Shan
    \item s2MFCC0
    \item AvrD1s2Shan
    \item s2MFCC12
    \item s2MFCC4
    \item s2MFCC6
    \item s2MFCC9
    \item s2Max
    \item s2Mean
    \item AvrD4s2Shan
    \item AvrD5s2Shan
    \item s2ZeroX
    \item sd\_RR
    \item TPTs1
    \item s2Dur
    \item sysMFCC2
    \item sysMax
    \item sysSampEnt
    \item sysShanEngy
    \item sysSkew
    \item sysVar
    \item diaMFCC11
    \item diaMFCC12
    \item diaMFCC6
    \item diaSampEnt
    \item diaShanEngy
    \item diaVar
    \item diaZeroX
    \item heartRate
    \item m\_RR
    \item m\_Ratio\_DiaRR
    \item mean\_IntS1
    \item mean\_IntSys
    \item s1Dur
    \item s1MFCC11
    \item s1MFCC4
    \item s1Max
    \item s1Mean
    \item s1ShanEngy
    \item s1Var
    \item s1ZeroX
\end{itemize}
\end{multicols}
\end{table}
\begin{figure}[H]
    \caption{Leave-one-out cross-validation results (mean and std-dev)}
    \makebox[\textwidth]{\includegraphics[width=1.1\textwidth]{logo}}
    \label{fig1}
\end{figure}
\begin{figure}[H]
    \caption{Stratified 10-fold cross-validation results (mean and std-dev)}
    \makebox[\textwidth]{\includegraphics[width=\textwidth]{10_fold}}
    \label{fig2}
\end{figure}
Due to the replication of the approach taken for scoring entries to the Physionet
challenge, it is possible to directly compare results to challenge entries.
This aims to provide a thorough understanding of the performance of the
proposed system in relation to other approaches. The system is further compared
to some successful algorithms prior to the challenge in the subsequent section,
in order to understand the performance of the system in a wider context of
heart sound analysis.\\

The most directly comparable results are to those presented by challenge
participants, used during the training of their algorithms. Many participants
used similar cross-validation scores to determine the performance of their
algorithm before testing on the final hidden dataset, and these provide a key
insight into the performance with regard to a variety of aspects.\\

Results obtained using the Leave-one-out cross-validation scoring are similar
to those of the highest scoring algorithms in the
challenge~\parencite{Homsi2017, Bobillo2016}. As a measure for performance on
unseen data, this suggests that the proposed algorithm generalises to a similar
degree. However, it is clear that algorithms generally score poorly in this
area. This is the general consensus across many of the algorithms presented for
the challenge and is a problem that requires further work. Higher scores in
10-fold cross validation than those of Leave-one-out cross-validation further
suggest that the algorithm is highly susceptible to degraded results, most
likely as a consequence of signal qualities varying from those of the training
set.\\
In addition, scores were also applied to the algorithm after balancing the full
database. The aim of this was to remove class imbalance across the training
and test set, to gain an understanding of how the model performs on each class
equally. Results of these tests can be viewed in Appendix~\ref{appendixC}. It
was found that, although hidden test set and 10-fold cross-validation scores
aren't affected by class imbalance, there is a significant increase in the overall
leave-one-out cross-validation score from 54.16\% to 66.13\%. This is currently
thought to be caused by the model not resampling by database during training.
As resampling during training does not maintain the balance between datasets, a
disproportionate number of normal samples from larger datasets is likely to be
present in the training set. This may cause the model to overfit this quality
of sample recording, resulting in poor classification on the left out test set.
It should however be noted that the significant reduction in samples caused by
the resampling makes fitting a model that generalises to other databases easier,
thus generating inflated results. However, when comparing the balanced database
results to those of Kay et al., overall performance is still found to be
significantly better than that of their proposed algorithm, despite use of a
larger balanced dataset~\parencite{Kay2017}. This is an area that requires
further investigation, and could potentially improve results of the algorithm
considerably.

A further issue that has been highlighted throughout previous literature, is
the use of different recording equipment for the recording of normal and
pathological signals in database $e$~\parencite{Kay2017, Bobillo2016}. It is
suggested that this may have a significant impact on results, as an algorithm
may inadvertently learn the difference in noise characteristics of signals,
rather than of the heart sounds themselves. This is of particular importance as
database $e$ is the largest available dataset. This issue is found in results
through examination of database $e$ in participant's leave-one-out
cross-validation scores. This may also be true in the case of the proposed
system as database $e$ has shown considerably higher specificity in results
than those the other database, both in balanced and unbalanced datasets.
Further would be needed to understand the extent of the effect that this has on
the performance of the proposed system.\\

The final 10-fold cross-validation score was found to be between, 2 and 12\%
less than those of the highest scoring models~\parencite{Zabihi2016, Homsi2017,
Kay2017}. This suggests that, when presented with signals similar to those
presented in training, the proposed model is not able to find distinguishable
patterns between healthy and pathological signal to the same degree as the
leading algorithms in the field. However, accuracy scores of 80\% show promise
for this method. Sensitivity and specificity scores are roughly equal,
suggesting little bias between normal and abnormal classes. The method also
performed equally well on the hidden test set.  The hidden test set score is
the only score based on predictions where no samples had previously been seen
by the algorithm during optimisation. A similar score to that of 10-fold cross
validation suggests that chosen features and hyperparameters generalise well to
unseen data. If scores in cross validation had been significantly higher than
that of the hidden test set, it would suggest that the model is tuning
parameters and features in a way that only benefits the score of the training
set. These scores suggest that the model is reasonably robust, and that with
improvements detailed in section~\ref{FutureWork} there is considerable
potential for this method.\\

Some of the most successful algorithms prior to the challenge also achieve
similar levels of success (As detailed in Table~\ref{PriorWorkTable}, although
these methods were evaluated on different datasets, so are not as directly
comparable)~\parencite{Ari2010, Quiceno-Manrique2010a, Maglogiannis2009}. This
clearly shows that high levels of accuracy are attainable through machine
learning methods for PCG classification. In addition these methods also venture
further into the area of classification for specific pathologies, a topic not
widely considered for the challenge.


\section{Discussion and further work}\label{FutureWork}
The current implementation of the system has provided promising results,
suggesting that the combination of techniques is well suited to the task of
abnormality detection. It is clear however, that further development of the
system could improve results further. This section defines some of the
recognised issues that could be addressed in each of the system's components,
in order to build on the work of the project in it's current form. Areas of
interest are also outlined to suggest potential avenues for building on the
system's current functionality.\\

A potential flaw in the current project is that beyond segmentation of the
original signal, pre-processing (and other components of the system) currently
make little use of biomedical domain knowledge to aid in processing of the
input data.  This is largely due to the author's lack of background in this
area, prior to development of this project. An example of a project that has
implemented this is the work by Goda et al.\ who, by recognising that trained professionals
can classify most heart conditions, given at least 5 seconds of audio, was able to
further segment audio in 5 second overlapping segments, essentially providing
additional atomic samples for training~\parencite{Goda2016}. It is thought that
other such assumptions based on physiological understanding could be made in
order to further enhance the system.\\

Another area that requires further attention is the implementation of the resampling
method, as discussed in Section~\ref{Eval}. It has been shown that there is a
significant possibility for improving leave-one-out cross-validation scores by
implementing resampling per-database. In terms of effort, this would be a
relatively simple implementation and was not included in the project's current
form primarily due to comprehension of the issue late in the project. It is
hoped that this would produce considerable improvements in the generalisation
performance of the project. In addition to this, resampling could conceivably
be integrated in the hyperparameter optimisation method, in a similar fashion
to the models themselves, allowing for selection of the resampling methods
dynamically. This has the potential to improve accuracy of the classifier by
providing an optimal selection of training data for the task.\\

The combination of hyperparameter and model optimisation was regarded as one of
the key reasons for it's success. The current method, although sufficient for
the task, may be further improved through deeper understanding of the models,
the interaction of the models in stacking classifiers, and the parameter
optimisation method.\\
For example, in the final selection of models, a linear SVM, RBF kernel SVM and
Naive Bayes models were chosen by the system. From intuition it is thought that
the reason these worked well is due to the complex combination of linear and
non-linear relationships in the input features. As the RBF kernel is well
suited to differentiating non linear patterns, and the linear SVM is well suited
for linear patterns, these models would in theory compliment one another. This
is also true of the Naive-bayes model, which considers each feature in
isolation from all others, contrasting the complex inter-feature relationships
(such as those most likely present in the MFCC and wavelet coefficients, for
example) considered by the other models. Given this intuition, more targeted
models could be provided to the dictionary for model selection, allowing for
more efficient convergence of the optimisaton algorithm. This theory is
however, simply conjecture at the current stage, and would require further
research to form a better understanding of the problem.\\
Building on this, it is known that a wide variety of optimisation methods, such
as genetic algorithms and swarm algorithms~\parencite[p.32]{Sesmero2015}.
Deeper understanding of these may yeild a more suitable optimisation method
than the tradition particle swarm method used in the current implementation.\\

The features in the current project clearly provided a reasonable
representation of the signal, suitable for the classification task. This was
largely due to the selective procurement of these features from literature,
where they had been demonstrated to provide useful representations of audio
both for PCG signals and in a more general context. Further analysis of
individual features and the feature set as a whole may allow for a more refined
feature set to be developed. This is in contrast to the current methodology
which largely relies on the trial and error approach of SFFS. A deeper
understanding may allow for more complex and context specific features to be
added to the feature set, and for other ineffective features to be removed.\\ An
example of this is the implementation of MFCC features. Although the current
MFCCs were seen to be relevant features, the coefficients are used directly in
a rather simplistic fashion, without any further processing. Other
implementations have shown that further processing could be used to augment
these features~\parencite{Zabihi2016}, providing additional potentially relevant
statistical data to the feature-set.\\

An application of the system that has not been considered in any great depth is
the further suitability of the method for more specific cardiovascular
classification tasks. A number of projects prior to the challenge have
demonstrated the success of methods similar to the proposed method in this
area~\parencite{Ari2010, Quiceno-Manrique2010a}. The work of Ari et al.\ is
of particular interest, as it demonstrated a reasonable general classifier
using similar SVM based classifiers to that of the current project, but also
aimed to further classify the signal using subsequent nested classifiers. This
method could be applied to potentially develop a highly useful diagnostic tool.\\

Given the quality of results presented, it is believed that this project has
successfully demonstrated the potential for heuristically optimised stacking
classifiers to detect pathological heart sounds. With additional improvements,
it is thought that the proposed methodology could be built upon, potentially
providing an innovative alternative solution to the currently used methods for
heart sound analysis.

\pagebreak

\appendix
\section*{Appendices}
\addcontentsline{toc}{section}{Appendices}
\renewcommand{\thesubsection}{\Alph{subsection}}

\subsection{Table of Features}\label{appendixA}
\begin{table}[H]
\centering
\caption{Description of features}
\onehalfspacing
\tiny
\label{my-label}
\begin{tabulary}{\linewidth}{LLLLL}
\hline
Tag                        & Feature Name                                    & Description                                                                          \\ \hline
heartRate                  & Heart Rate                                      & The number of beats per minute (BPM)                                                 \\
m\_RR                      & Mean RR Interval                                & Average length of heart cycles                                                       \\
sd\_RR                     & Std-dev RR Interval                             & Standard deviation of heart cycles                                                   \\
mean\_Int*                 & Mean Segment Interval                           & Average length of segment                                                            \\
sd\_Int*                   & Std-dev Segment Interval                        & Standard deviation of segment                                                        \\
R\_SysRR                   & Systolic-RR Ratio                               & Ratio of Systolic interval to RR interval                                            \\
R\_DiaRR                   & Diastolic-RR Ratio                              & Ratio of Diastolic interval to RR interval                                           \\
R\_SysDia                  & Ratio of Systolic-RR/Diastolic-RR Ratios        & Ratio of above ratios                                                                \\
*ZeroX                     & Zero-crossing                                   & Zero-crossing rate of a segment                                                      \\
*RMS                       & Root Mean Square                                & The Root Mean Square of a segment                                                    \\
*ShanEngy                  & Shannon Energy                                  & The Averaged Shannon Energy Envelope of a segment                                    \\
*Dur                       & Duration                                        & The duration of a segment                                                            \\
*Max                       & Max                                             & The peak value of an absolute segment                                                \\
*Mean                      & Mean                                            & The mean value of a segment                                                          \\
*Skew                      & Skewness                                        & The temporal skewness of a segment                                                   \\
*Kurt                      & Kurtosis                                        & The temporal kurosis of a segment                                                    \\
*Var                       & Variance                                        & The variance of a segment                                                            \\
*SampEnt                   & Sample Entropy                                  & The sample entropy of a segment                                                      \\
*ShanEnt                   & Shannon Entropy                                 & The Shannon Entropy of a segment                                                     \\
TPT*                       & Total Power (time)                              & The total power of a segment in the time domain                                      \\
TPF*                       & Total Power (frequency)                         & The total power of a segment in the frequency domain                                 \\
*Flat                      & Spectral Flatness                               & The flatness of a segment's frequency spectrum                                       \\
*Cent                      & Spectral Centroid                               & The centroid of a segment's frequency spectrum                                       \\
*Spread                    & Spectral Spread                                 & The spread of a segment's frequency spectrum                                         \\
*MFCC\{$n$\} & Mel-frequency Cepstrum Coefficients             & MFCC coefficient number $n$, where $n = \{1, \ldots, 13\}$                           \\
m\_Ratio\_SysRR            & Mean RR/Systole interval ratio                  & Mean value of the interval ratios between systole and RR in each heart beat          \\
sd\_Ratio\_SysRR           & Std-dev RR/Systole interval ratio               & Std-dev value of the interval ratios between systole and RR in each heart beat       \\
m\_Ratio\_DiaRR            & Mean RR/Diastole interval ratio                 & Mean value of the interval ratios between diastole and RR in each heart beat         \\
sd\_Ratio\_DiaRR           & Std-dev RR/Diastole interval ratio              & Std-dev value of the interval ratios between diastole and RR in each heart beat      \\
m\_Ratio\_SysDia           & Mean Systole/Diastole interval ratio            & Mean value of the interval ratios between systole and diastole in each heart beat    \\
sd\_Ratio\_SysDia          & Std-dev Systole/Diastole interval ratio         & Std-dev value of the interval ratios between systole and diastole in each heart beat \\
D[1-5]Shan             & Detail coefficient shannon entropy              & Shannon entropy of DWT detail coefficient 1-5                                        \\
A5Shan                     & Approximation coefficient shannon entropy       & Shannon entropy of DWT approximation coefficient 1-5                                 \\
\mbox{TotD[1-5]*Shan}         & Total detail coefficient shannon entropy        & Total Shannon entropy of DWT detail coefficient 1-5 across signal                    \\
TotA5*Shan                 & Total approximation coefficient shannon entropy & Total Shannon entropy of DWT approximation coefficient 1-5 across signal             \\ \hline
\end{tabulary}
\justifying
\scriptsize
Feature sources include:~\parencite{Homsi2016, Schmidt2015, Liang1998,
Lerch2012}\\
`*' --- denotes feature is applied to S1, systolic, S2 and diastolic segments
respectively.
\end{table}
\pagebreak


\subsection{Commandline Interface}\label{appendixB}
\singlespacing
\lstset{basicstyle=\tiny, style=mystyle}
\begin{lstlisting}[numbers=none]
usage: main.py [-h] [--features-fname OUTFNAME] [--segment] [--optimize]
               [--eval EVAL] [--select-features SELECT\_FEATURES] [--backward]
               [--parameters_fname OUTFNAME] [--fs_fname OUTFNAME]
               [--no-parallel] [--reanalyse] [--verbose]
               [--resample-mix RESAMPLE_MIX] [--keep-logs]
               TESTDIR OUTDIR

Script for the classification of PCG data.

positional arguments:
  TESTDIR               Directory of test data to train the system
  OUTDIR                Directory to store output analyses

optional arguments:
  -h, --help            show this help message and exit
  --features-fname, -o
                        Specify the name of the file to save generated
                        features to for future use
  --segment             Run Matlab segmentation script to create segmentation
                        analysis
  --optimize            Run optimization algorithm to find best model and
                        parameters for classifier
  --eval, -e            Number of evaluation to pass to the particle swarm
                        optimization
  --select-features
                        Run feature selection algorithm to find best features
                        for model, either selecting or reducing features by
                        the integer specified. This depends on use of
                        --backward flag, to determine forward or backward
                        feature selection. (a value of 0 skips feature
                        selection entirely, using previously generated
                        features if available. A value less than 0 uses all
                        available features.)
  --backward, -b        Runs backward feature selection as opposed to default
                        forward selection.
  --parameters_fname
                        Specify the name of the file to save generated
                        features to for future use
  --fs_fname            Specify the name of the file to save generated feature
                        selection model to for future use
  --no-parallel, -p     Disable processing in parallel. (Will likely decrease
                        performance but may aid in debugging)
  --reanalyse           Force regeneration of database features
  --verbose, -v         Specifies level of verbosity in output. For example:
                        '-vvvvv' will output all information. '-v' will output
                        minimal information.
  --keep-logs           Keep previously generated logs that aren't overwritten
                        by current process
\end{lstlisting}
\doublespacing
\pagebreak{}

\subsection{Final results}\label{appendixD}
Results of of tests on final optimised model\\
Leave-one-out scores are shown in Table~\ref{LOGO}\\
Stratified cross-validation scores can be found in Table~\ref{KFCV}\\

\begin{table}[H]
\doublespacing
\caption{Leave-one-out scores}
\label{LOGO}
\footnotesize
All scores are an average of 10 iterations $\pm$ standard-deviation
\scriptsize
\centering
\begin{tabulary}{\linewidth}{LCCCCCCC}
\toprule
       & A                 & B                 & C                 & D                 & E                 & F                 & Mean              \\ \midrule
$Acc$ & $0.5395\pm0.0104$ & $0.4896\pm0.0129$ & $0.5673\pm0.0298$ & $0.5173\pm0.0223$ & $0.5869\pm0.0300$ & $0.5492\pm0.0140$ & $0.5416\pm0.0318$ \\
$Se$   & $0.7281\pm0.0164$ & $0.8664\pm0.0240$ & $0.6775\pm0.0208$ & $0.7865\pm0.0218$ & $0.5397\pm0.0459$ & $0.7387\pm0.0493$ & $0.7228\pm0.1005$ \\
$Sp$   & $0.3509\pm0.0264$ & $0.1127\pm0.012$  & $0.4571\pm0.0571$ & $0.2481\pm0.0416$ & $0.6340\pm0.0387$ & $0.3596\pm0.0464$ & $0.3604\pm0.1624$ \\ \bottomrule
\end{tabulary}
\end{table}

\begin{table}[H]
\caption{10-fold cross-validation score}
\footnotesize
All scores are an average of 10 iterations $\pm$ standard-deviation
\doublespacing
\label{KFCV}
\scriptsize
\centering
\begin{tabulary}{\linewidth}{LCCCCCCCCCCC}
\toprule
       & 1                 & 2                 & 3                 & 4                 & 5                 & 6                 & 7                 & 8                 & 9                 & 10                & Mean              \\ \midrule
$Acc$ & $0.7969\pm0.0246$ & $0.8049\pm0.0244$ & $0.8043\pm0.0153$ & $0.8111\pm0.0295$ & $0.8095\pm0.0261$ & $0.7999\pm0.0208$ & $0.8061\pm0.0299$ & $0.8150\pm0.0198$ & $0.8140\pm0.0245$ & $0.7928\pm0.0224$ & $0.8055\pm0.0069$ \\
$Se$   & $0.8121\pm0.0420$ & $0.8164\pm0.0360$ & $0.8193\pm0.0302$ & $0.8184\pm0.0634$ & $0.8158\pm0.0484$ & $0.8061\pm0.0438$ & $0.8325\pm0.0546$ & $0.8421\pm0.0321$ & $0.8246\pm0.0474$ & $0.7798\pm0.0302$ & $0.8167\pm0.0157$ \\
$Sp$   & $0.7818\pm0.0293$ & $0.7935\pm0.0267$ & $0.7894\pm0.0208$ & $0.8037\pm0.0280$ & $0.8033\pm0.0226$ & $0.7937\pm0.0214$ & $0.7798\pm0.0229$ & $0.7878\pm0.0206$ & $0.8035\pm0.0219$ & $0.8059\pm0.0228$ & $0.7942\pm0.0091$ \\ \bottomrule
\end{tabulary}
\end{table}


\pagebreak
\subsection{Balanced dataset test results}\label{appendixC}
Results of testing database using a resampled, balanced dataset.\\
Dataset was resampled by database, using jacknife resampling (Sampling without
replacement) and consisted of a total of 944 samples.
\begin{table}[H]
\centering
\caption{Hidden test-set scoring}
\begin{tabular}{@{}lll@{}}
\toprule
$Acc$  & $Se$    & $Sp$    \\ \midrule
80.77\% & 79.41\% & 82.14\% \\ \bottomrule
\end{tabular}
\end{table}

\begin{table}[H]
\doublespacing
\caption{Leave-one-out scores}
\footnotesize
All scores are an average of 10 iterations $\pm$ standard-deviation
\scriptsize
\centering
\begin{tabulary}{\linewidth}{LCCCCCCC}
\toprule
       & A                 & B                 & C                 & D                 & E                 & F                 & Mean              \\ \midrule
$Acc$ & $0.5784\pm0.0153$ & $0.6062\pm0.0131$ & $0.8737\pm0.0131$ & $0.5939\pm0.0302$ & $0.7022\pm0.0136$ & $0.6130\pm0.0189$ & $0.6613\pm0.1029$ \\
$Se$   & $0.4984\pm0.0225$ & $0.8816\pm0.0000$ & $0.7475\pm0.0261$ & $0.6692\pm0.0319$ & $0.6417\pm0.0227$ & $0.7290\pm0.0636$ & $0.6946\pm0.1161$ \\
$Sp$   & $0.6585\pm0.0342$ & $0.3309\pm0.0262$  & $1.0000\pm0.0000$ & $0.5185\pm0.0741$ & $0.7628\pm0.0134$ & $0.4971\pm0.0509$ & $0.6280\pm0.2141$ \\ \bottomrule
\end{tabulary}
\end{table}

\begin{table}[H]
\caption{10-fold cross-validation score}
\footnotesize
All scores are an average of 10 iterations $\pm$ standard-deviation
\doublespacing
\scriptsize
\centering
\begin{tabulary}{\linewidth}{LCCCCCCCCCCC}
\toprule
       & 1                 & 2                 & 3                 & 4                 & 5                 & 6                 & 7                 & 8                 & 9                 & 10                & Mean              \\ \midrule
$Acc$ & $0.7961\pm0.0364$ & $0.8187\pm0.0384$ & $0.8108\pm0.0238$ & $0.8019\pm0.0316$ & $0.8103\pm0.0326$ & $0.8217\pm0.0417$ & $0.7845\pm0.0593$ & $0.8053\pm0.0262$ & $0.8023\pm0.0148$ & $0.8105\pm0.0312$ & $0.8062\pm0.0103$ \\
$Se$   & $0.7922\pm0.0603$ & $0.8060\pm0.0453$ & $0.7853\pm0.0363$ & $0.7772\pm0.0489$ & $0.8044\pm0.0504$ & $0.8070\pm0.0538$ & $0.7553\pm0.0700$ & $0.7623\pm0.0312$ & $0.7772\pm0.0272$ & $0.8088\pm0.0461$ & $0.7876\pm0.0184$ \\
$Sp$   & $0.8000\pm0.0360$ & $0.8315\pm0.0504$ & $0.8363\pm0.0289$ & $0.8266\pm0.0366$ & $0.8161\pm0.0351$ & $0.8363\pm0.0627$ & $0.8137\pm0.0578$ & $0.8484\pm0.0380$ & $0.8274\pm0.0284$ & $0.8123\pm0.0412$ & $0.8249\pm0.0136$ \\ \bottomrule
\end{tabulary}
\end{table}
\pagebreak
\printbibliography{}

\end{document}