\documentclass[titlepage, 12pt]{scrartcl} \usepackage{enumitem} \usepackage[british]{babel} \usepackage[style=apa, backend=biber]{biblatex} \DeclareLanguageMapping{british}{british-apa} \usepackage{url} \usepackage{float} \usepackage{ragged2e} \usepackage{caption} \usepackage{multicol} \newcommand{\tabitem}{~~\llap{\textbullet}~~} %\restylefloat{table} \usepackage[table]{xcolor} \usepackage{multirow} \usepackage{perpage} \MakePerPage{footnote} \usepackage{abstract} \usepackage{graphicx} \usepackage{setspace} % Create hyperlinks in bibliography \usepackage{hyperref} \usepackage{amsmath} \usepackage{booktabs} \usepackage{tabulary} \usepackage[pass]{geometry} \usepackage{pdflscape} \usepackage{graphicx} \usepackage[T1]{fontenc} \usepackage[utf8]{inputenc} \usepackage{blindtext} \setkomafont{disposition}{\normalfont\bfseries} \usepackage{etoolbox} \usepackage{titlesec} \setcounter{secnumdepth}{4} \titleformat{\paragraph} {\normalfont\normalsize\bfseries}{\theparagraph}{1em}{} \titlespacing*{\paragraph} {0pt}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex} \graphicspath{{./resources/}} \addbibresource{~/Documents/library.bib} \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\argmin}{arg\,min} % Fix for medeley's rubbish underscore handeling in generated bib files \DeclareSourcemap{ \maps{ \map{ % Replaces '{\_}', '{_}' or '\_' with just '_' \step[fieldsource=url, match=\regexp{\{\\\_\}|\{\_\}|\\\_}, replace=\regexp{\_}] } \map{ % Replaces '{'$\sim$'}', '$\sim$' or '{~}' with just '~' \step[fieldsource=url, match=\regexp{\{\$\\sim\$\}|\{\~\}|\$\\sim\$}, replace=\regexp{\~}] } \map{ % Replaces '{\_}', '{_}' or '\_' with just '_' \step[fieldsource=url, match=\regexp{\{\\\#\}|\{\#\}|\\\#}, replace=\regexp{\#}] } } } %\newsavebox{\abstractbox} %\renewenvironment{abstract} % {\begin{lrbox}{0}\begin{minipage}{\textwidth} % \begin{center}\normalfont\sectfont\abstractname\end{center}\quotation} % {\endquotation\end{minipage}\end{lrbox}% % \global\setbox\abstractbox=\box0 } %\makeatletter %\expandafter\patchcmd\csname\string\maketitle\endcsname % {\vskip\z@\@plus3fill} % {\vskip\z@\@plus2fill\box\abstractbox\vskip\z@\@plus1fill} % {}{} %\makeatother \newcommand{\dtoprule}{\specialrule{1pt}{0pt}{1.4pt}% \specialrule{1pt}{0pt}{\belowrulesep}% } \newcommand{\dbottomrule}{\specialrule{1pt}{0pt}{1.4pt}% \specialrule{1pt}{0pt}{\belowrulesep}% } \DeclareCiteCommand{\citeyearpar} {} {\mkbibparens{\bibhyperref{\printdate}}} {\multicitedelim} {} % MATLAB Code block stuff... \usepackage{color} \usepackage{listings} \definecolor{dkgreen}{rgb}{0,0.6,0} \definecolor{gray}{rgb}{0.5,0.5,0.5} \lstset{language=Matlab, keywords={break,case,catch,continue,else,elseif,end,for,function, global,if,otherwise,persistent,return,switch,try,while}, basicstyle=\ttfamily, keywordstyle=\color{blue}, commentstyle=\color{gray}, stringstyle=\color{dkgreen}, numbers=left, numberstyle=\tiny\color{gray}, stepnumber=1, numbersep=10pt, backgroundcolor=\color{white}, tabsize=4, showspaces=false, showstringspaces=false} \usepackage[shortcuts]{extdash} \definecolor{codegreen}{rgb}{0,0.6,0} \definecolor{codegray}{rgb}{0.5,0.5,0.5} \definecolor{codepurple}{rgb}{0.58,0,0.82} \definecolor{backcolour}{rgb}{0.95,0.95,0.92} \lstdefinestyle{mystyle}{ keywords={}, numberstyle=\tiny, basicstyle=\scriptsize, breakatwhitespace=false, breaklines=false, captionpos=b, keepspaces=true, numbers=left, numbersep=5pt, showspaces=false, showstringspaces=false, showtabs=false, tabsize=2 } \lstset{style=mystyle} \begin{document} \newgeometry{lmargin=1.5cm} \begin{titlepage} \begingroup \setlength{\tabcolsep}{1.5cm} \begin{tabular}[c]{p{0.30\textwidth} | p{0.4\textwidth}} {\vspace{1.2cm} \Large School of Electronic Engineering and Computer Science \par} & {\vspace{1.2cm} \large Sound and Music Computing \newline Project Report \the\year \par}\\ & {\vspace{0.5cm} \Large \textbf{Extraction of Audio Features from PCG Signals for the Classification of Heart Abnormalities} \par}\\ \vspace{0.4\textheight} \includegraphics[width=5cm]{qmul_logo} & {\vspace{1cm} \large \textbf{Samuel Perry}}\\ & \multicolumn{1}{|r}{August \the\year} \end{tabular} \endgroup \end{titlepage} \restoregeometry \doublespacing \begin{abstract} A system is presented for the analysis and classification of pathological phonocardiogram signals. The project aims to provide a reliable and robust detection method to aid in the early detection of cardiovascular abnormalities. The project utilises a novel combination of feature extraction, optimisation and machine learning techniques to produce accurate results, demonstrating the clear potential for this methodology in the area of heart abnormality detection. \end{abstract} \renewcommand{\abstractname}{Acknowledgements} \begin{abstract} I would like to thank my supervisor, Dr.\ Tony Stockman, for his guidance and support throughout the project. I am also extremely grateful for the advice given by Robert Blaauboer. Finally, I would also like to thank John Thompson, Tom Pengelly and David Pengelly for their valuable support. \end{abstract} \tableofcontents \newpage \section{Introduction} Cardiovascular diseases are the most prevalent cause of death in Europe, accounting for 37.5\% of all fatalities in 2013~\parencite{Eurostat2016}. Traditionally, cardiac auscultation has been performed manually using a standard stethoscope, with the aim of detecting heart defects aurally. This has been a fundamental method for detecting heart valve disorders for over a century. However, auscultation is a skill that requires training and can only usually be performed by a medical professional, such as a GP. As a result, manual auscultation is significantly susceptible to human error~\parencite{Hanna2002}. Automation of this method using technology may provide a solution, and recent research has shown promise in this area. A large amount of research has focused on analysis of Electrocardiogram (ECG) signals. Although useful for detecting pathologies, ECG equipment is expensive and requires a trained professional for use. Therefore it is not currently feasible for developing countries and rural areas where there may be few physicians available. A comparatively affordable and non-invasive alternative is the Phonocardiogram (PCG)~\parencite[p.130]{Reed2004}. Typically recorded using an electronic stethoscope, a PCG signal is a recording of sound made as the heart contracts, analogous to the sound heard by physicians when performing cardiac auscultation manually. Automated auscultation could provide an initial diagnosis for heart defects without the need for a trained medical professional. This would allow relatively cheap equipment to analyse a patient's heart sound, and automatically recommend further inspection based on analysis. By providing earlier diagnosis of conditions that may have otherwise been overlooked, this technology could have a significant impact on reducing mortality rates as a result of heart conditions. \section{Related work} There are currently a wide variety of methods employed for the analysis and classification of PCG signals. Current methods can typically be divided into 3 areas, each of which are combined to create a full classification system. These areas are: signal preprocessing, signal segmentation, and feature extraction/classification. The performance and evaluation of complete systems are also discussed in section~\ref{Classification} \subsection{Signal preprocessing} There are a large number of factors that lead to variation in quality of PCG recordings: stethoscope type, make and model, its microphone/sensors, the position used to record (i.e.\ lower left sternal border, apex, pulmonic area, aortic area), built in filters/signal processing used by the stethoscope (i.e.\ noise filters, anti-tremor filters), medication that a patient may be taking, as well as many other aspects that may influence the recorded signal~\parencite[p.4]{Pavlopoulos2004}. This presents a significant issue when attempting to analyse and compare a database of signals, as variations in recordings and artefacts caused by factors other than heart sounds will most likely interfere with analysis and comparison methods. To account for this, pre-processing methods are widely used, aiming to standardise a database. This is also used as a way to accentuate features of the data that are expected to be relevant for classification.\\ A common method employed is the use of decimation and static filters to remove unwanted spectral content that is most likely noise~\parencite{Liang1997a, Homsi2016, Springer2016, Gupta2007}. This helps reduce higher frequency noise such as speech, microphone movement, breathing and other interference caused externally. Signals are commonly downsampled to around 1--4KHz, with anti-aliasing filter specifications varying across the literature. Generally, highpass chebychev or butterworth filters are favoured with cutoff frequencies ranging from 400--750Hz.\\ In addition, many methods decompose the filtered signal using wavelet based methods, such as the discrete wavelet transform (DWT)~\parencite{Liang1997a, Pavlopoulos2004}, continuous wavelet transform (CWT)~\parencite{Langley2016} or wavelet package decomposition (WPD)~\parencite{Liang1998}. These methods are commonly used to separate components of a signal based on their spectral content. Wavelet transforms are popular as, unlike Fourier transforms, they are well localised in both the time and frequency domain. This allows for the analysis of PCG signals across multiple frequency bands whilst maintaining transient temporal events in the resulting decomposition~\parencite[p.93]{Ari2008}. This may be used for analysis of transient events such as murmurs, that may consist of higher frequency components than normal heart sounds. \subsection{Signal segmentation}\label{Segmentation} Algorithms for the segmentation of PCG data aim to extract the structure of the signal over time. This is a key stage in the analysis of PCG signals, as the structure of the signal and relationships between the fundamental heart sounds (FHSs) form the basis for much of further analysis performed on PCG data.\\ % TODO: insert segmented graph of PCG cycle A number of methods exist for the extraction of FHSs. Traditional methods rely on direct extraction of peaks from amplitude envelopes in the time domain to determine the structure of a signal. These methods perform various processing/transformations in order to accentuate the transient events with the intention of isolating them.\\ Early work in this area by Liang et.\ al described a method using the popular Shannon energy envelope, achieving good accuracy across 37 recordings of children~\parencite{Liang1997b}. The algorithm aimed to segment the data by first extracting the envelope, then applying adaptive rule based thresholds, to determine peaks corresponding to segmentation points. When comparing results to hand-annotated ground truth data, the system achieved a reported accuracy score of 84\%. However, due to the small sample size, and potential lack of noise in the database used, this may not translate well to a larger database recorded in sub-optimal conditions.\\ More recent methods used spectral representations to assist in the splitting of the FHSs, in particular using wavelet decomposition. These methods tend to perform more robustly on signals of varying conditions.\\ Building on previous work, Liang et.\ al presented an improved method, using the discrete wavelet transform to decompose and reconstruct the signal into 7 distinct frequency bands~\parencite{Liang1997a}. Applying a similar method of envelope extraction and peak picking to each frequency band, the best estimate of all frequency bands is then chosen as the final result. Criterion for this choice is based on the number of S1s and S2s detected, and the number of artefacts discarded for each frequency band. This method achieved an improved accuracy of 93\% across a larger database of 77 recordings. This suggests that the algorithm is as robust if not more so than previous work by Liang et al.\\ Vepa et al.\ proposed a wavelet decomposition based method that uses a combination of simplicity and envelope features~\parencite{Vepa2008}. This approach attempts to improve robustness when analysing signals of varying quality by using multiple complimentary features. This allows the method to base decisions on a variety of statistical properties. Evaluating the algorithm on a collection of 160 heart cycles from a variety of sources, a reported accuracy of 84\% was achieved.\\ More recently, a variety of machine learning methods have been implemented with reasonable success. Gupta et al.\ presented a method that applies $k$-means clustering to replace standard threshold-based methods for determining peak classification in a segmentation algorithm based on standard envelopes~\parencite{Gupta2007}. This achieved a reported accuracy of 90.29\%. Due to the envelope-based method for feature extraction, this method is still susceptible to noise and artefacts that occur within the frequency bands of the heart sounds.\\ Sepehri et al.\ proposed a method that combines neural networks with Power Spectral Density (PSD) estimates~\parencite{Sepehri2010}. By exploiting the periodic nature of S1 and S2 heart sounds, combined with their narrow frequency range, a neural network is trained to separate these sounds from other events, such as noise and murmurs. This method achieved a reported 93.6\% accuracy on a significantly larger database than previous methods detailed.\\ The most significant successes in segmentation algorithms have been observed through use of probabilistic models such as Hidden Markov Models (HMMs). Early research using these models by Ricke et al.\ utilised embedded HMMs to model the 4 states of the PCG and their transitions~\parencite{Ricke2005}. MFCCs and Shannon Energy were used as feature vectors for the models. Results of 98\% accuracy were reported, although this was tested on only a small database of signals.\\ Gill et al.\ achieved similar results, most notably with specific consideration for the duration of each state in the HMM~\parencite{Gill2005}. This is handled through the extraction of 6 duration features based primarily on peaks. These features form vectors for training the HMM. Results of 98.6\% sensitivity, 96.9\% positive predictivity for S1 sounds and 98.3\% sensitivity, 96.5\% positive predictivity for S2 sounds is reported. The issue of state duration was further addressed by Schmidt et.\ al through use of a Duration-dependent Hidden Markov Model (DHMM)~\parencite{Schmidt2015}. The DHMM is a modified HMM that considers the duration of the current state when calculating the probability of transition to another state. This modification scored a reported sensitivity of 98.8\% and a positive predictivity of 98.6\%.\\ Building on previous work using HMMs, Springer et al.\ presented a segmentation algorithm by using Hidden Semi-Markov Models (HSMMs) in combination with logistic regression~\parencite{Springer2016}. Use of Hidden semi markov model allows for a priori information on the duration of the current state to be used in probability calculation of the subsequent state. In this case, the knowledge that there is an upper and lower limit on the duration of each component is used in the calculation of transition probabilities. A modified viterbi algorithm is then used to calculate the most likely set of transitions based on observed features. Logistic regression is used to improve discrimination between state features when compared to discriminatory methods used by previous work. Performance was evaluated on a significantly larger database than previous methods and achieved a reported accuracy of $95.63\% \pm 0.85\%$. Due to it's rigorous evaluation and high accuracy, this method is currently considered the state-of-the-art for PCG signal segmentation.\\ Table~\ref{SegmentationTable} provides a brief overview of significant research into PCG segmentation. For a more complete summary of the current state of PCG segmentation, please refer to Liu et.\ al~\parencite{Liu2016} \newgeometry{margin=1cm} % modify this if you need even more space \begin{landscape} %\centering \begin{table}[htbp] \captionof{table}{Summary of Segmentation Algorithms} \label{SegmentationTable} \scriptsize \rowcolors{1}{gray!15}{white} \doublespacing \begin{tabulary}{\linewidth}{LLLLL} \dtoprule Author & Method & databases & \mbox{Reported} Results & Notes \\ \bottomrule Springer et.\ al \citeyearpar{Springer2016} & HSMM, Logistic regression & 10,172s of recordings from 112 patients. 12,181 first and 11,627 second heart sounds. & $95.63\pm0.85\%$ & Supervised algorithm. \\ Huiying et.\ al \citeyearpar{Liang1997b} & Normalised average Shannon energy envelope, peak picking & 37 recordings, 14 pathological murmurs and 23 physiological murmurs. 515 cycles & $91.03\%\;Ac$ & Unsupervised Algorithm. database consists entirely of child recording. Optimized on full database \\ Vepa et.\ al \citeyearpar{Vepa2008} & Wavelet decomposition, energy and simplicity measurement & 160 heart cycles collected from a variety of sources (training CDs, web resources) & $84\%\;Ac$ & Unsupervised Algorithm, Optimized on full database \\ Sun et.\ al \citeyearpar{Sun2014} & Viola integral envelope extraction, short-time modified Hilbert transform, peak picking & 6949s of recordings, from 121 patients & $97.37\%\;Ac$ & Supervised algorithm. Tolerance for segmentation accuracy not specified \\ Sepehri et.\ al \citeyearpar{Sepehri2010} & Spectral density estimation, auto-regressive parameters, multi-layer perceptron neural network & 120 recording, from 60 patients & $93.6\%\;Ac$ & Supervised algorithm \\ Ricke et.\ al \citeyearpar{Ricke2005} & Shannon energy (and related features), HMM & 9 recordings, from 9 patients & $98\%\;Ac$ & Supervised algorithm \\ Schmidt et.\ al \citeyearpar{Schmidt2015} & DHMM, Auto-correlation duration features, Homomorphic envelogram & 113 recordings, from 113 patients. 8s per recording. 15 abnormal recordings & $98.8\;Se,\;98.6\;P_+$ on test set & All data recorded ``lateral to the sternum in the fourth intercostal space on the left side''. Mix of noisy and clean recordings. 40 recording used for training, 73 for testing \\ Gill et.\ al \citeyearpar{Gill2005} & Homomorphic envelogram, Embedded HMMs & 44 recording, 17 subjects. 30-60s per recording & $98.6\%\;Ac, 96.9\;P_+$ for S1. $98.3\;Ac,\;96.5\;P_+$ for S2 & Recording taken in sub-optimal environments (noisy hospitals, offices etc...) \\ Gupta et.\ al \citeyearpar{Gupta2007} & Homomorphic filtering, $k$-means clustering & 41 patients, 340 heart cycles. 110 normal, 124 systolic murmur, 106 diastolic murmur & $90.29\%\;Ac$ & Unsupervised Algorithm. \\ \hline \dbottomrule\\ \end{tabulary} $Ac = \text{Accuracy}, Se = \text{Sensitivity}, P_+ = \text{Positive predictivity}$ \end{table} \end{landscape} \restoregeometry \doublespacing \subsection{Feature extraction/classification models}\label{Classification} A wide variety of methods exist for the extraction of statistical features and classification of PCG data. Most notably, the range of methods that were developed as a result of the recent Physionet/Computing in Cardiology (CinC) Challenge 2016 have improved the quality of abnormality classification in noisy signals. The challenge was assembled to provide researchers with a large database of normal/pathological PCG signals of varying quality. This enabled the development of algorithms that could be evaluated on a significant database, in order to determine performance across a range of conditions/signal qualities~\parencite{Clifford2016}. This section first details significant work produced prior to the challenge, then highlights key works produced for the challenge to outline the breadth of methods for robust heart sound analysis. \subsubsection{Work prior to the Physionet challenge} Work prior to the Physionet challenge was conducted predominantly with the aim of classifying specific heart conditions. Until recently, little research had been produced with regards to general abnormality detection, with many projects choosing to focus on specific conditions such as murmurs, atrial fibrillation, flutter, and heart valve disease. This section outlines some key research into these areas, alongside initial research into general abnormality detection.\\ Reed et al.\ implemented a simple general classification algorithm using artificial neural networks (ANNs) and wavelet decomposition~\parencite{Reed2004}. As initial work into this field, preprocessing such as segmentation is not performed and features remain relatively simple when compared to more recent methods. Also, due to the comparatively small sample size used for training (1 patient per abnormality, 4 cycles per patient), a reported accuracy of 100\% would have a strong possibility of generalising poorly. This does however, serve as an early example of limited success in general heart sound classification.\\ Maglogiannis et al.\ presented a classifier for discrimination of heart valve disease from regular heart sounds using an SVM (Support Vector Machine) classifier~\parencite{Maglogiannis2009}. Roughly 100 features were extracted from the signal, based on direct analysis of each heart cycle component (S1, Systole, S2, Diastole) and the average Shannon energy envelope of these components. A database of 198 heart sounds was curated for the project, acquired from 8 sources, such as medical CDs and pre-existing databases. An accuracy of 91.43\% was reported using 10-fold stratified cross-validation. In addition, the project aimed to classify individual abnormalities in a 3 step process, by distinguishing between systolic or diastolic murmurs, and then distinguishing between aortic or mitral diseases. The classifier achieved accuracy between 90-97\% for these classifications. This approach demonstrates the potential for a system to accurately distinguish between normal and abnormal heart sounds in a generalisable way, given carefully selected features.\\ Ari et.\ al also proposed an SVM based method for abnormality classification~\parencite{Ari2010}. A modified Least-squares SVM (LSSVM) is used in order to improve separability between normal and abnormal datapoints during training. 32 wavelet based features from previous literature are used as feature vectors for a modified LSSVM, un-modified LSSVM and a standard SVM. Comparison of the system shows that the proposed technique performs significantly better on all test sets with an accuracy of between 86\% and 100\%, dependent on database. This highlights the importance of choosing an appropriate classification method for achieving accurate results.\\ Quiceno-Manrique et al.\ demonstrate the use of various time frequency representations (TFR) such as short-time Fourier transform, wavelet transforms, Wigner-Ville distribution etc\ldots, with a $k$-nearest neighbour classifier (k-NN) for systolic murmur detection~\parencite{Quiceno-Manrique2010a}. This work highlights the effectiveness of alternative TFRs to traditional Fourier methods. This method also employs Principle Component Analysis (PCA), ie.\ the mapping of a high dimensional feature space to a lower dimension, in order to improve computational performance. Features were evaluated using a database of of 22 patients, 6 of which were labeled as having a systolic murmur. The highest reported accuracy was achieved using MFCCs as the primary feature vector achieving a 98\% accuracy on 10-fold cross validation.\\ Schmidt et al.\ aimed to find features that could be used for classification of coronary artery disease through detection of small murmurs~\parencite{Schmidt2015}. A large number of features are calculated to provide vectors for classification; Parametric spectral features such as ARMA are used, alongside instantaneous frequency and octave power measurements. Complexity features, such as sample entropy and simplicity, are also calculated in an attempt to exploit the likely stochastic nature of murmurs when compared to normal heart sounds. Given the large number of features calculated, PCA is used to retain only the most relevant information. Quadratic discriminant analysis (QDA) is then used as a classifier to provide a final accuracy score of 73\%.\\ An overview of significant research prior to the Physionet challenge is provided in table~\ref{SumPrior}. It is also noted that none of the databases used for prior research are publicly available. \newgeometry{margin=1cm} % modify this if you need even more space \begin{landscape} \begin{table}[htbp] \captionof{table}{Summary of research prior to the Physionet Challenge 2016}\label{PriorWorkTable} \scriptsize %\centering \rowcolors{1}{gray!15}{white} \label{SumPrior} \doublespacing \begin{tabulary}{\linewidth}{LLLLLL} \dtoprule Author & Pre-processing/segmentation & Features & Classification Method & Database & Reported Accuracy \\ \hline Maglogiannis et.~al \citeyearpar{Maglogiannis2009} & Wavelet decomposition, Shannon energy peak picking & Features derived from wavelet decomposition and PCG segmentations & SVM & 198 recordings, 38 normal, 41 AS systolic murmur, 43 MR systolic murmur, 38 AR diastolic murmur, 38 MS diastolic murmur & $91.43\%\;Ac$ \\ Ari et~al. \citeyearpar{Ari2010} & Amplitude envelope peak picking~\parencite{Ari2007} & Wavelet based features & LSSVM & 64 patients, 64 recordings, 512 cycles & $88.750-100\%\;Ac$ (dependant on abnormality type) \\ Quiceno-Manrique et~al. \citeyearpar{Quiceno-Manrique2010a}& Downsampled to 4KHz, Normalised to maximum of signal, ECG assisted QRS complex detection algorithm used for segmentation & Spectral features derived from STFT, Wavelet decomposition and quadratic energy distributions & $k$-NN & 22 patients, 16 normal, 6 abnormal, 8 recordings (12s) per patient & $98\%\;Ac$ \\ Schmidt et~al. \citeyearpar{Schmidt2015} & Signal filtered into frequency bands, Segmented by HMM based method+hand corrected, removal of high variance sub-segments to remove noise & Parametric spectral features (AR, ARMA and Music), Instantaneous frequency and amplitude, Power in octave bands & QDA & 435 Recordings, 133 patients, 70 normal, 63 abnormal & $73\%\;Ac$ \\ Reed et~al. \citeyearpar{Reed2004} & --- & Wavelet decomposition coefficients, Manual feature reduction & ANN & 5 patients, 4 cycles per patient & $100\%\;Ac$ \\ \dbottomrule\\ \end{tabulary} $Ac = \text{Accuracy}$ \end{table} \end{landscape} \restoregeometry \subsubsection{Physionet challenge entries}\label{ChallengeEnt} \doublespacing The 2016 Physionet/CinC Challenge aimed to encourage development of heart abnormality detection algorithms by providing a large open database of PCG signal recordings, sourced from a variety of both clinical and non-clinical environments. (Further details on the database can be found in section~\ref{Database}. The complete specification is presented by Liu et al.~\parencite{Liu2016}). In addition, participants were provided with a state-of-the-art heart sound segmentation algorithm, as proposed by Springer et al.\ in Section~\ref{Segmentation}. Participants were then tasked with the creation of a classification algorithm that could robustly discriminate between healthy and unhealthy heart sound samples. The challenge received 348 entries in total, each of which was scored on a hidden test database using a Modified accuracy measure ($Acc$) as defined by Clifford et. al~\parencite{Clifford2016}: \begin{table}[htbp] \centering \caption{Output Classification} \label{OutputClassification} \doublespacing \begin{tabular}{llccc} \hline & & \multicolumn{3}{c}{Algorithm's Output} \\ \hline & & \multicolumn{1}{l}{Normal} & \multicolumn{1}{l}{Uncertain} & \multicolumn{1}{l}{Abnormal} \\ \multirow{4}{*}{Ground Truth} & Normal, clean & $Nn_1$ & $Nq_1$ & $Na_1$ \\ & Normal, noisy & $Nn_2$ & $Nq_2$ & $Na_2$ \\ & Abnormal, clean & $An_1$ & $Aq_1$ & $Aa_1$ \\ & Abnormal, noisy & $An_2$ & $Aq_2$ & $Aa_2$ \\ \hline \end{tabular} \end{table} \doublespacing Weights are calculated as: \begin{table}[H] \centering \doublespacing \begin{tabular}{ll} $Wa_1 = \frac{\text{Clean abnormal recordings}}{\text{Total abnormal recordings}}$ & $Wa_2 = \frac{\text{Noisy abnormal recordings}}{\text{Total abnormal recordings}}$ \\ $Wn_1 = \frac{\text{Clean normal recordings}}{\text{Total normal recordings}}$ & $Wn_2 = \frac{\text{Noisy normal recordings}}{\text{Total normal recordings}}$ \end{tabular} \end{table} Modified sensitivity ($Se$), specificity ($Sp$) and overall accuracy ($Acc$) are then calculated as: \begin{align*} &Se=Wa_1\frac{Aa_1}{Aa_1+Aq_1+An_1}+Wa_2\frac{Aa_2+Aq_2}{Aa_2+Aq_2+An_2} \\ &Sp=Wn_1\frac{Nn_1}{Na_1+Nq_1+Nn_1}+Wn_2\frac{Nn_2+Nq_2}{Na_2+Nq_2+Nn_2} \\ &Acc=\frac{Se+Sp}{2} \end{align*} This section summarises some of the key works presented for the challenge, including some of the most accurate models, and a baseline classifier provided to participants as a starting point.\\ This baseline classifier was provided in order to demonstrate the basic structure of systems expected for entries~\parencite{Liu2016}. The classifier extracted a selection of 20 basic features, primarily focused on relative timings and amplitudes of heart sounds. A binary logistic regression model was chosen for classification. From the 20 extracted features, 13 were selected based on their statistical significance, measured using forward likelihood ratio selection. The system achieved a reported score of 66\% on the test set, giving a baseline score for participants to build on. In addition, the system was trained using leave-one-out cross validation. By removing a single training database on each fold, the generalisation of the algorithm could then be evaluated, training on all other databases. Results showed that performance decreased significantly when training via this method, giving an average accuracy of 59\%, with training database $b$ scoring as low as 47\%. This could suggest that individual databases in the database are not sufficiently represented by other databases, or that features do not model abnormalities sufficiently.\\ Homsi et.\ al proposed a system that utilised 131 time domain, STFT-based and wavelet-based features which, when combined with nested ensemble classifiers, produced an accuracy score of 84.48\%~\parencite{Homsi2017}. This algorithm combines many commonly-used features in previous PCG related literature such as wavelet decomposition based features, MFCCs and Shannon Energy. The system also uses a total of 40 classifiers, 20 for signals labelled to be `standard' and 20 for those labelled as `atypical'. A mixture of Random Forrest, LogitBoost and Cost-Sensitive Classifiers (CSC) are used to classify signals in parallel. Final results are combined using a rule-based decision, designed through manual experimentation.\\ Potes et al.\ presented a similar approach to that of Homsi et al.~\parencite{Potes2016}. 124 similar TFR features were extracted and used as vectors for an AdaBoost classifier. This classifier was then combined with a deep learning approach using a Convolutional Neural Network (CNN) classifier. The signal was decomposed into 4 frequency bands and segmented, to provide input to the CNN. Results from both AdaBoost and CNN classifiers were then combined using a set decision rule. This method produced the highest score on the test set for the challenge at 86.02\%.\\ Zabihi et al.\ took an alternative approach by choosing not to segment PCG data in the pre-processing stage~\parencite{Zabihi2016}. This was with the intention of reducing computational complexity of the resulting algorithm. In addition, the proposed method utilizes a sequential forward feature selection (SFS) and Linear Predictive Coefficients (LPC) for the reduction of features used for classification. This benefits the system by removing correlated and irrelevant features, thus reducing computational complexity and removing irrelevant noise from feature vectors prior to training. Final classifications are determined through cascaded ensembles of ANNs. The signal is first classified as either of high or low sound quality, and then as normal or abnormal. The system achieved a final score of 85.9\% on the hidden test set.\\ Plesinger et al.\ opted to develop a new form of machine learning algorithm based on probability assessment~\parencite{Plesinger2017}. In this method, features are mapped to histograms and thought of as probability distributions. Weights are applied based on the number of occurrences of each feature, which in turn generates a probability function. This can then be used to calculate the estimated classification of a new data point. From the 228 extracted features, 53 features were then selected based on calculated sensitivity and specificity scores using generated histograms. This allowed for the training scores to be automatically optimized by the algorithm.\\ Kay et al.\ present a method using ANNs, a wide variety of features and PCA for feature reduction~\parencite{Kay2017}. The algorithm scores well on the test set. However, this work is most notable for it's rigorous evaluation by authors, using leave-one-out cross validation for a clearer understanding of the generalisation of the algorithm, as well as highlighting issues with the underlying database that are discussed in Section~\ref{Database} \newgeometry{margin=1cm} % modify this if you need even more space \begin{landscape} \begin{table}[H] \captionof{table}{Summary of top 10 Physionet Challenge 2016 entries} \label{PhysionetTable} \scriptsize %\centering \rowcolors{1}{gray!15}{white} \doublespacing \begin{tabulary}{\linewidth}{CCCCC} \dtoprule Author & Features & Classification Method & Reported Scores & Challenge Score \\ \midrule Potes et.~al \citeyearpar{Potes2016} & 124 TFR features & Combined AdaBoost/ANN & In-house test set accuracy: AdaBoost-abstain: 79\%, CNN: 82\%, Combined classifiers: 85\% & 86.02\% \\ Zabihi et.~al \citeyearpar{Zabihi2016} & 40 temporal, spectral and TFR features, reduced using SFS and LPC & 2 ensembles of neural networks & Training accuracy: Maximum of 91.50\% & 85.90\% \\ Kay et.~al \citeyearpar{Kay2017} & CWT, MFCCs, complexity measures, Inter-beat features, PCA & ANNs & A range of cross validation based tests were used to analyse performance. See paper for full details & 85.20\% \\ Bobillo \citeyearpar{Bobillo2016} & MFCCs and WPD, reduced using tensor decomposition & $k$-NN & A range of cross validation based tests were used to analyse performance. See paper for full details & 84.54\% \\ Homsi et.~al \citeyearpar{Homsi2017} & 131 time domain, STFT based andwavelet based features & Combined ensembles of LogitBoost, Random Forrest and CSC & Training accuracy 87.7\%, In-house test accuracy: 93.24\% & 84.48\% \\ Maknickas et.~al \citeyearpar{Maknikas2017} & MFCCs, reduced by Karhunen–Loeve transform & Deep Neural Network & Training accuracy 99.7\%, Validation accuracy 95.2\% & 84.15\% \\ Plesinger et.~al \citeyearpar{Plesinger2017} & Statistical and symmetry properties of amplitude envelopes for S1 and S2 sounds & Custom probability assessment machine learning algorithm & Training accuracy 90.3\% & 84.11\% \\ Rubin et.~al \citeyearpar{Rubin2016} & MFCCs & Convolutional neural networks & -- & 83.99\% \\ Jiayu (paper not submitted) & -- & -- & -- & 82.82\% \\ Abdollahpur et.~al \citeyearpar{Abdolahpur2017} & time, TFR and perceptual features, reduced using Fisher's discriminant analysis & Combined ANNs & Training accuracy: 91.6\%, 87\%, 84.55\% (prior to ANN combination method) & 82.63\%\\ \dbottomrule\\ \end{tabulary} \end{table} \end{landscape} \restoregeometry \doublespacing \section{Database}\label{Database} %TODO: Briefly describe what is needed from a database for this project A database representative of real-world PCG signals was needed to train models and evaluate the proposed method effectively. A number of criteria were identified as necessary for the success of the proposed project: \begin{itemize} \item The database must contain sufficient PCG data, so that a model trained to discriminate between said signals would, in theory, generalise to new PCG data. \item The database should contain a mixture of clean and noisy signals that represent a variety of real world situations, as real-world classification would likely be performed in sub-optimal conditions. If this is not possible, noise could potentially be added to clean signals to simulate this. \item It must be possible to differentiate healthy signals from a variety of individual pathologies, in order to provide a general abnormality detection algorithm. This should be reflected in the database through inclusion of a variety of signals representing different pathological heart conditions. \item Data must be reliably labelled in order to generate a reliable model (particularly when using machine learning methods, as in the proposed project). Labels should ideally be verified by a trained professional. \end{itemize} \noindent Two viable options were then considered based on the above criteria: \begin{enumerate} \item The Physionet challenge database \item Generation of a synthetic dataset via methods such as that proposed by Almasi et al.~\parencite{Almasi2011} \end{enumerate} Generation of synthetic data was considered, as few well-formed alternative databases exist, other than the Physionet challenge data. The database curated for the Physionet challenge was selected for this project, as it fulfilled the criteria sufficiently and posed less of a risk in terms of signal quality, due to all signals being produced in real-world environments. However, synthesis of PCG data remains an interesting possibility for improving evaluation of classification systems and could be considered for the generation of additional samples in future work. \subsection{Database summary} The selected database is significantly larger and contains a wider variety of signal conditions than any database used for previous research (as detailed in table~\ref{PriorWorkTable}). It is released as an open-source resource and is documented in significant detail by Liu et al.~\parencite{Liu2016}. The lack of any alternative databases, comparable in size or variety of content, perhaps makes this resource the current standard for PCG analysis projects. In addition, by replicating the conditions of the Physionet challenge, results can be directly compared with those of the challenge participant's, with the aim of understanding how the proposed algorithm compares to the current state of PCG analysis. \begin{itemize} \item The database consists of 6 sub-databases, labelled $a$ to $f$. \item These sub-databases have been sourced from a variety of professionals, over the course of a decade. \item A total of 3,240 recordings are included, created using varying equipment. \item 2575 recordings are labelled as normal, 665 are labelled as abnormal. \item All samples have been resampled to 2KHz \item Samples were recorded in a range of both clinical and non-clinical environments. \item Many recordings are corrupted with environmental noise, such as microphone friction, breathing, talking etc\ldots \item Sections of silence are present in some recordings, most significantly in database $e$. \end{itemize} \subsection{Considerations}\label{DBCons} There are a number of issues with the acquired database that have been highlighted, both through previous literature and through development of the proposed system. These have been considered throughout development and evaluation of the project.\\ A significant issue highlighted by Liu et al.\ is the large number of normal recordings compared to pathological recordings. This creates a clear class imbalance issue that can result in over-inflated classification results. This is considered in Section~\ref{Resample}.\\ Another key issue is the difference between the databases used by participants of the Physionet challenge, and the available data that was acquired for this project. For unknown reasons, information such as patient labels used for training many of the challenge participant's models have not been made publicly available and so could not be used for training of the proposed system.\\ The lack of access to the hidden test set used for evaluating challenge entries also had a significant impact on evaluation. An alternative method for evaluating using only the data provided has been proposed in Section~\ref{metrics}.\\ Finally, an issue is highlighted by Bobillo with regards to database $e$~\parencite{Bobillo2016}. The recording of normal and pathological signals using separate devices is likely to cause issues and is discussed in Section~\ref{Eval}. %BEGIN NEW MATERIAL \pagebreak \section{Design} This project aims to provide robust heart abnormality detection for PCG signals, such that use of the system could reliably recommend further medical attention when necessary. It is clear from previous research that machine learning methods for classification have shown the most promise in this area, and that ensemble methods have been largely successful in improving classification accuracy of base classifiers~\parencite{Homsi2017, Potes2016}. However, one such method that has recently shown significant success in other fields, but is not present in recent PCG analysis literature, is the stacking meta-classifier~\parencite[p.498]{Tobergte2013a}. The presented system was therefore designed to explore the potential of this classification method in the context of PCG signal classification. This section details the four key components developed to form the final system: signal preprocessing (Section~\ref{preprocessing}), audio feature extraction (Section~\ref{featEx}), classification (Section~\ref{class}) and optimisation (Section~\ref{optimise}). % TODO: Create flow diagram Preprocessing -> Feature extraction -> % Model optimisation -> Performance evaluation \subsection{Preprocessing}\label{preprocessing} It quickly became apparent that, due to significant variations in the available data (as a result of noise, variations in recording equipment etc...), the effective preprocessing of such data would be a critical factor when designing the system. This section details the most significant preprocessing steps taken, in order to both minimize noise and extract the basic structure of the signal. \subsubsection{Signal decimation} A common method employed to simultaneously reduce computation time and remove extraneous information is to decimate the input signal by an integer factor. According to Shannon sampling theorem, a digital signal can only represent frequency content up to half the sample rate of the signal (the nyquist rate)~\parencite[p.140]{Kadis1999} Therefore, by removing every $n$th sample, high frequency content can be removed whilst lowering the number of samples that must be processed in subsequent operations. An anti-aliasing filter must also be applied to the signal in order to filter harmonic distortion generated by the process. As it is commonly stated in the literature that little relevant information in PCG signal is found above 400Hz, all signals were resampled to 1KHz giving a 500Hz cutoff frequency, using an 8th order zero-phase Chebyshev type I filter. \subsubsection{Dataset resampling}\label{Resample} A common issue with data collected from the real world (ie. non-synthetic data) is the imbalance of classes in data. As noted by Liu et al.~\parencite{Liu2016}, this is the case with the available dataset, as there are less pathological signals than healthy signals. This presents an issue with classification tasks, as imbalance can have a negative impact on classification of the minor class. In this context, class imbalance could potentially impact classification accuracy for abnormal samples, so must be handled appropriately. This issue can be approached using a number of methods. Sophisticated oversampling methods such as SMOTE (Synthetic Minority Oversampling Technique) offer one solution. SMOTE generates synthetic samples using interpolation and adds these to the data set to balance the classes, without using direct copies of existing data. However, oversampling techniques such as this can increase overfitting of models, and don't always offer reasonable improvement in performance~\parencite{Longadge2013}. Undersampling is the most common method used, typically by randomly removing samples from the major class. This has the obvious disadvantage of reducing data available for training. However, an improved method using $k$-Means clustering has been shown to be effective in previous cardiovascular classifications problems~\parencite{Rahman2013}. This method was seen to be the best choice for the proposed system. This method is illustrated using a small generated 2-dimesional dataset in Figure~\ref{cent}. \begin{figure}[H] \caption[caption of centroid]{Example resampling of synthesised dataset using cluster centroids\footnotemark} \makebox[\textwidth]{\includegraphics[width=\textwidth]{centroid}} \label{cent} \end{figure} \footnotetext{This figure was adapted from: \url{http://contrib.scikit-learn.org/imbalanced-learn/stable/}} \subsubsection{Signal segmentation} With one notable exception~\parencite{Langley2016}, previous classification algorithms rely heavily on the ability to segment signals into the four fundamental heart sounds. This is a key prerequisite to the extraction of relevant features. The defining of signal structure allows for the relationships between it's components to be analysed, as described in Section~\ref{featEx}. To facilitate the development of robust algorithms for the Physionet challenge, participants were provided with an implementation of Springer's HSMM based segmentation algorithm. As the highest scoring algorithm in the literature, it was clearly the most suitable algorithm to use for the proposed system. In addition to the high accuracy of segmentation, the wide adoption of this algorithm is beneficial for comparison with other algorithms submitted to the challenge. Results produced by the proposed system will generally not be coloured by the differences in quality of segmentation algorithms, allowing for more direct comparison of classification methods. However, it is noted that despite the high performance of the algorithm, errors in segmentation will still occur, that may have a negative impact on feature quality. As methods proposed by previous literature, such as hand correction by a professional~\parencite[p.2203]{Liu2016} are not feasible in this context, and considering the low number of erroneous results produced by the algorithm~\parencite[p.2]{Goda2016} it was decided that these errors would not pose a significant problem. An illustration of PCG data segmentation can be seen in Figure~\ref{segs}. \begin{figure}[H] \caption{Example segmentation of PCG data} \makebox[\textwidth]{\includegraphics[width=\textwidth]{segs}} \label{segs} \end{figure} \subsection{Feature extraction}\label{featEx} The extraction of feature vectors from data is a fundamental component of most machine learning based systems. The aim is to construct meaningful representations of the data that emphasize information relevant to the classification problem. In the proposed project, 188 features were extracted from the data, procuring feature extraction techniques from a wide range of previous literature, as well as using novel perceptual features commonly found in audio/music analysis (See Sections~\ref{FFT} and~\ref{Time}). There are also potential issues that can occur when using large sets of features for training. The method proposed for addressing these issues is discussed in section~\ref{SFS}. This section provides a summary of the main feature categories. Please refer to appendix~\ref{appendixA} for a full breakdown of all features. \subsubsection{Time-domain features}\label{Time} A range of features were generated, based directly on the time series data. Features such as: \begin{itemize} \item Average and standard-deviation of segment intervals, for all heart sounds and complete heart cycles. \item Ratio of systolic and diastolic period to total heart cycle period. \item A range of statistical features such as entropy, skewness and variance for each heart sound. \item A selection of envelope based features for each heart sound. \end{itemize} 18 features provided by the Physionet challenge focused on timings between segments of the heart cycles. It was thought that these features would be useful in capturing irregularities caused by conditions such as arrhythmias, atrial septal defect and other conditions that are likely to affect relative timing of heart sounds. Many conditions that can be detected by traditional auscultation are characterised by an increase in loudness of the S1 and/or S2 heart sounds~\parencite{Brown2008}. This suggests that features relating to human perception of loudness may aid in the detection of such conditions. Simple envelope based features such as RMS, peak loudness and the Shannon energy envelope (Equation~\ref{ShanEQ}) that proved popular in previous literature, were extracted for this reason~\parencite[p.73-77]{Lerch2012}. In addition, statistical features such as sample entropy and skewness (Equation ~\ref{SkewEQ}) were used to evaluate the distribution of samples for each heart sound, these were selected to provide a representation of the temporal ``shape'' of each sound. \begin{equation}\label{ShanEQ} SE = \frac{-1}{N}\sum\limits_{n=0}^N x(n)^2\cdot \log{x(n)^2} \end{equation} \begin{equation}\label{SkewEQ} S=\frac{E(x-\mu)^3}{\sigma^3} \end{equation} Where:\\ $x(n)$ is the input signal\\ $E(t)$ is the expected value\\ $\mu$ is the mean of the signal\\ $\sigma^2$ is the variance of the signal \subsubsection{FFT-based features}\label{FFT} It was recognised that a time domain representation alone was unlikely to provide a sufficient representation for discerning a wide variety of conditions. Using a time-frequency representation to characterise the spectral components of the signal has proven effective in the majority of literature. The classic method for producing a spectral representation of a signal is to apply the Discrete Fourier Transform (DFT) (as defined in Equation~\ref{FFTEQ}) over a sliding window of size $N$. By decomposing the signal into a series of sine and cosine waves, a representation of the signal's spectral content across a range of frequency bands is produced. This can be used for further analysis of heart sounds based on their spectral characteristics. \begin{equation}\label{FFTEQ} X(k)=\sum\limits_{n=0}^{N}x(n)e^{\frac{-j2\pi kn}{N}} \end{equation} Where $x(n)$ is the input signal\\ Features generated using this representation would, in theory, be useful for identifying conditions that reside in specific frequency bands, such as murmurs, for example~\parencite{Sepehri2010}.\\ An example of such features are Mel-Frequency Cepstrum Coefficients (MFCCs). Popular in speech processing, MFCCs provide a compact representation of a signal's spectral shape. MFCCs are calculated by first applying $N$ (a user-defined parameter) triangular filter banks, spaced using the mel scale to the magnitude spectrum. Applying a discrete cosine transform to the log of the filterbank outputs provides the final set of coefficients (for further details, please refer to~\parencite{Lerch2012}). This analysis creates a perceptually relevant representation of spectral shape, in effect mimicking the way in which humans might perceive the spectral shape of heart sounds. The reasoning for this is that, as the aim is to provide a system with performance better than, or equal to to that of a human, features that mimic what a human perceives may prove effective at distinguishing conditions in the way that a human does. This has shown to be effective in previous literature, with multiple systems utilising perceptual features with success~\parencite{Ortiz2016, Rubin2016, Quiceno-Manrique2010a}. 13 MFCCs were calculated for each heart sound and averaged to provide 13 features per sample.\\ %TODO: Generate MFCC spectum In addition to MFCCs, other statistical features were extracted from the spectrum such as spread, skewness, kurtosis and flatness. These features aim to provide alternate spectral measurements to MFCCs, in a similar way to their temporal counterparts as described in Section~\ref{Time}. Although the Fourier representation of PCG signals has proven effective in many cases, there are drawbacks of this representation that must be considered. One key issue that is inherent of Fourier transforms is the time-frequency trade-off. An increase in frequency resolution will always result in a decrease in temporal resolution. This poses a problem, as it is not possible to localize transient events accurately in the frequency domain using this method. This method may also suffer in the presence of background noise common in PCG signals. Previous studies have shown that these factors may have a significant impact when detecting conditions such as Coronary Stenoses~\parencite{Ergen2001, Akay1990} \subsubsection{Wavelet decomposition features} The wavelet transform has been used effectively as an alternative time-frequency representation to Fourier methods. The fundamental concept of the wavelet transform is to represent an input signal as a set of scaled and shifted finite oscillations. By comparing the signal with each scale of wavelet at all points in time, a set of $N\times A$ (Where $A$ is the number of scales) coefficients are generated. These define the scale and position needed for each wavelet in order to fully reconstruct the signal (This is illustrated in Figure~\ref{wave}. For further details, refer to~\parencite{Polikar1994}). The benefit of this transform is that it is well localized in both time and frequency domains. This allows for accurate representation of transient events such as clicks and snaps that are characteristic of heart conditions such as Mitral valve prolapse or stenosis~\parencite{Brown2008}.\\ For the proposed system, a 5 level DWT using daubechies wavelets-4 mother wavelet was used for decomposition and reconstruction. Statistical features such as entropy were then calculated, both on the reconstructed signal and directly on coefficients to attain a total of 48 features.~\parencite{Homsi2016} % TODO: Insert wavelet diagram here \begin{figure}[H] \caption{Example 5 level daubechies 4 wavelet decomposition and reconstruction (normalised). Plots in descending order: D1, D2, \ldots, D5, A1} \makebox[\textwidth]{\includegraphics[width=1.0\textwidth]{wavelet}} \label{wave} \end{figure} \subsubsection{Feature scaling and imputing} A common problem when working with multiple features is the difference in scale between features. This problem can cause many machine learning algorithms to place bias on larger scale features and can significantly impact the time taken for certain algorithms to converge. This is particularly significant when applying algorithms sensitive to feature scale such as SVMs (described in Section~\ref{SVM}). To address this, a Min-Max scaler was applied to training and test sets prior to training models. This scales all values to within a 0--1 range producing a set of features on a common scale.\\ It is also common to encounter missing values in features. These can occur as a result of $\log(0)$ or division by 0 calculations, amongst other edge cases. A standard method for handling these values is to apply an imputer, replacing values with the mean of the feature vector~\parencite{VanderPlas2017}. \subsection{Stacking classifier with cross-validation}\label{class} The stacking classifier is an ensemble classifier, that uses the results of multiple base classifiers as input to a 2nd level meta-classifier, which in turn is used to generate a final prediction. $k$-fold cross validation is used across base classifiers, training on $k-1$ folds of input data, and applying to the remaining validation set. The results of these predictions from each base classifier are combined and used to train the 2nd level classifier which produces the final predictions based on the probabilities and predictions provided. This is illustrated in figure~\ref{stack}\\ Given it's proven accurate performance across a range of tasks, it was expected that this classification model could be applied effectively to produce an alternative method for abnormality detection than those presented in previous literature. % TODO:Insert stacking classifier diagram \begin{figure}[H] \caption[caption of stack]{Stacking classifier overview\footnotemark} \makebox[\textwidth]{\includegraphics[width=0.5\textwidth]{stacking_cv_classification_overview}} \label{stack} \end{figure} \footnotetext{Figure retrieved from:\url{http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/}} \subsubsection{Base classifiers} Clearly, an important consideration when using any ensemble method is the selection of the base classifiers. In order for any ensemble method to perform well, it must be constructed using a selection of classifiers that individually provide useful models for the data~\parencite[p.484]{Tobergte2013a}. A wide variety of models were considered for use as base and meta models including models such as Tree based, $k$-Nearest Neighbour, and AdaBoost classifiers. Selection of these models was based on a novel approach using hyperparameter optimisation as discussed in Section~\ref{optimise}. The following sections detail the 3 final models selected by the optimisation algorithm; A combination of SVM and Naive-Bayes classifiers, with a Logistic Regression meta classifier. \paragraph{SVM}\label{SVM} The SVM classifier aims to fit a hyperplane to data that maximises the separability between classes. This results in a model that has been shown to generalise well in many cases, as maximising separability between classes is also likely to increase the margin for error in separation of classes. This type of classifier is also able to generate hyperplanes in non-linear space, using a techniques known as `kernel tricks'. This works by mapping linear data to a higher dimension, allowing non-linearly separable classes to be separated by the same method. The details of the SVM and Kernel-SVM are complex and outside the scope of this report. Further information can be found in~\parencite[p.187]{Tobergte2013a}.\\ SVMs have been prevalent in previous literature, shown to be effective in separation of a variety of heart conditions~\parencite{Ari2010} The use of kernels to map parameters to higher dimensions is a key advantage of this model, allowing for non-linear relationships that are likely to be present in the large variety of features to be well represented in classification. Choice of kernels, and relevant hyperparameters is detailed in Section~\ref{PSOp}. \paragraph{Naive-Bayes} Commonly used in text classification problems, where there is typically a high-dimensional feature space, Naive Bayes classification uses Bayes rule to determine the probability of classification, given a vector of features. This is calculated as: \begin{equation} P(y\mid x_1,\ldots,x_n)=\frac{P(y)\prod\limits_{i=1}^{N}P(x_i\mid y)}{P(x_1,\ldots,x_n)} \end{equation} The implementation used assumes a Gaussian distribution for all features, calculating the probability of a feature as: \begin{equation} P(x_i\mid y)=\frac{1}{\sqrt{2\pi \sigma_y^2}}\exp\bigg(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\bigg) \end{equation} Where:\\ $\mu$ is the mean of the distribution\\ $\sigma^2$ is the variance\\ Using Maximum Likelihood estimation to estimate $\sigma$ and $\mu$ given the feature vector, a classification for new features can then be calculated as: \begin{equation} \hat{y}=\argmax\limits_y P(y)\prod\limits_{i=1}^nP(x_i\mid y) \end{equation} Where:\\ $x$ is the feature vector to be classified\\ $\hat{y}$ is the estimated classification\\ Despite their computational simplicity, Naive Bayes classifiers have been shown to produce highly accurate classifications models. The assumption that each feature is completely independent allows for extremely fast classification and scalability to large datasets, with many dimensions~\parencite[p.300]{Zhang2004}. It was thought that these benefits would make the classifier suitable for the proposed system, as the relatively high dimensionality of features and quantity of data points could then be classified quickly, to obtain initial results. Despite the inclusion of more complex models, this model was chosen via automatic selection for the final model. Refer to section~\ref{PSOp} for further details. \paragraph{Logistic regression} Logistic regression is a regression model that aims to fit as hyperplane to data points by minimizing a cost function using weighted features. By applying weights to feature vectors then applying a sigmoid function, a hypothesis function is defined as: \begin{equation} h_\theta(x)=\frac{1}{1-e^{-\theta^{T}x}} \end{equation} Where:\\ $x$ is a feature vector\\ $y$ is a class label vector \\ $\theta$ is a weight vector \\ A cost function can then be defined as: \begin{align} &J(\theta)=\argmin\limits_\theta\frac{1}{2m}\sum\limits_{i=1}^m\Big(h_\theta(x^{(i)})-y^{(i)}\Big)^2+\text{Regularization}(\theta)\\ &\text{Regularization}{(\theta)}_\text{L1}=\lambda\sum\limits_{j=1}^n\mid\theta_i\mid\\ &\text{Regularization}{(\theta)}_\text{L2}=\lambda\sum\limits_{j=1}^n\theta_i^2 \end{align} Where:\\ $\lambda$ is the regularization parameter used to help prevent overfitting\\ By minimizing the cost function, classification predictions can then be made using the hypothesis function~\parencite{Ng2012}.\\ Logistic regression was chosen as the meta-classifier primarily due to it's simplicity and performance in testing. It is thought that this algorithm performed particularly well as output from base classifiers was linearly separable and relatively simple (in comparison to input features). The choice of meta-classifier is a potential area for improvement and it is noted that a range of meta-classifiers have been proposed for different tasks that utilise stacking~\parencite[p.29]{Sesmero2015}. Further work in this area could potentially provide improved results. \subsection{Model optimisation}\label{optimise} As discussed in previous sections, two of the most important aspects that affect the performance of a classification system are it's models, and the input features. A combination of relevant features and well tuned models is therefore likely to provide an accurate classification system. However, it is not always immediately clear which values to choose for parameters, or which features to use as inputs. This is especially true when given such a wide selection of models to choose from, and such high dimensional feature spaces, as are used in the proposed system. To address this issue, two automatic optimisation approaches were implemented, with the aim of maximising the accuracy of the proposed system. \subsubsection{Sequential feature selection}\label{SFS} It was recognised that the extraction of such large numbers of features in the proposed system would likely result in a large amount of redundant information. There are two commonly used methods for addressing this problem: feature reduction and feature selection. Feature reduction involves reducing features to a lower dimensionality using techniques such as PCA. Conversely, feature selection involves selectively removing features entirely via methods such as Sequential Floating Forward Selection (SFFS). Both aim to reduce the amount of redundant information in features by removing or reducing features that are not expected to benefit the model. As a selection of models were to be used, each potentially handling dimensionality differently (SVMs in particular), it was decided that feature selection would be most appropriate for this application.\\ Through experimentation, the chosen method was SFFS. This method is an adaptation of tradition sequential forward selection, that also uses sequential backward selection to allow for subsequent removal of added features when necessary. SFFS is an iterative wrapper method that adds features and re-trains the chosen model sequentially, choosing features that increase the accuracy of the model output (using 3-fold cross validation to avoid overfitting). Final models used as few as 40 features, increasing both accuracy of classifications and computation time of models significantly.\\ It should be understood that this method does not guarantee a globally optimal set of features. An exhaustive feature selection algorithm is capable of this but this would incur significant computational cost. For further details on SFFS please refer to~\parencite[p.3]{Ferri1994} \subsubsection{Particle swarm optimisation}\label{PSOp} The particle swarm optimisation algorithm is an iterative meta-heuristic algorithm that aims to find the set of parameters that maximises a given function. Given a $n$ dimensional parameter space, the algorithm randomly initialises sets of `particles' representing random combinations of parameters. As the algorithm progresses particles travel through the parameter space, updating their position based on their velocity, best historical score and the best historical score of the swarm. As the algorithm iterates, particles will converge on local optima, producing potential solutions. The best score is chosen after the final iteration as the best parameter selection. Annotated pseudocode for this algorithm is shown in code block~\ref{PSCode}~\parencite{Clerc2002} \onehalfspacing \begin{lstlisting}[escapeinside={(*}{*)}, label={PSCode}, caption={Particle Swarm Optimisation Pseudocode}] Do //For all particles... For (*$i$*)=1 to Population Size // If the current function score is better than the historical // function score for the current particle, store new position if (*$f(\overrightarrow{x}_i)>f(\overrightarrow{p}_i)\text{ then }\overrightarrow{p}_i = \overrightarrow{x}_i$*) // Store best position of all particles in neighbourhood (*$\overrightarrow{p}_g=\text{max}(\overrightarrow{p}_{\text{neighbors}})$*) // For each dimension in the parameter space... For (*$d=1$*) to Dimension // Update velocities (*$v_{id}=v_{id}+\phi_1(p_{id}-x_{id}+\phi_2(p_{gd}-x_{id})$*) // Ensure particle velocity is within limit (*$v_{id}=\text{sign}(v_{id})\cdot \text{min}(\text{abs}(v)_{id}, v_{\text{max}}))$*) // Update particle position (*$x_{id}=x_{id}+v_{id}$*) Next (*$d$*) Next (*$i$*) Until termination criterion is met \end{lstlisting} \doublespacing The use of this algorithm allowed for the efficient optimisation of all parameters relating to the stacking classifier and it's base classifiers, resulting in a finely tuned classification model that would not have been producible using traditional trial and error methods to search for optimal parameters.\\ During the initial design phase, it was found that the abundance of machine learning algorithms available make selection of the optimal model difficult, requiring in depth knowledge of a range of machine learning techniques. A novel approach used by recent stacking classifier applications has been in the use of meta-heuristic algorithm to select models automatically, in addition to tuning parameters. By thinking of the base classifiers as hyperparameters themselves, models can be swapped in and tuned automatically by the particle swarm algorithm to provide a locally optimal selection of base classifiers for the model~\parencite{Sesmero2015}. This technique was used to pick the 3 final models described in section~\ref{class} from a selection of 6 models\footnote{The available models for selection were: Random Forrest, $k$-Nearest Neighbour, Logistic Regression, Naive-Bayes, SVM (kernel based and linear models) and AdaBoost classifiers, implemented using Scikit-learn}. This dynamic selection of models was seen to be one of the key contributors to the overall success of the algorithm. As with the chosen feature selection algorithm, this method of optimisation is not guaranteed to provide a globally optimal solution. It was thought that for the proposed system a locally optimal system would suffice, particularly given the highly complex parameter space used in implementation. This is discussed in detail in Section~\ref{ModOp}. \subsection{Model performance evaluation method}\label{metrics} In order to fully understand the performance of the system (and to evaluate the impact of design decisions throughout development), a group of scoring methods were implemented to test the system's performance in a selection of scenarios. The aim was to provide reliable metrics that would highlight the system's strength and weaknesses and to provide quantifiable measures with which to compare the system to the range of alternative methods proposed in the literature.\\ One of the most basic metrics was the scoring of the trained model on a separate hold-out dataset. By reserving a selection of samples from across the databases, a trained model could then be scored on this dataset for accuracy, sensitivity and specificity (metrics described in Section~\ref{ChallengeEnt}) to determine the system's performance on an unseen set of samples. This method is widely used to provide a basic understanding of a model's ability to generalise to new data, a crucial requirement of the system. Data was split using a grouped stratified shuffle split, grouping by database. This ensured an equal number of randomly selected classes were taken from each database to produce training and test sets. This approach was taken to avoid class imbalance issues caused when there is a significant difference in class frequency, as detailed in Section~\ref{Resample}. It should be noted that although samples were stratified by class, it was not possible to stratify samples by patient. This may have an impact on results, as the presence of data from the same patient in both training and test set may artificially inflate results, where the model has learnt patterns specific to that patient that do not generalise to others.\\ A more robust method for for model evaluation is cross-validation. By splitting the full dataset into multiple folds, and training models on each, metrics can be calculated on each fold, and an average can be taken to provide a measure of the system's performance over all folds. 10-fold cross validation, stratified by class, was chosen for evaluation of the system. This provides an insight into the performance of the algorithm across the dataset. This is a common method used by all participants of the Physionet challenge and is commonly found in prior literature.\\ It is highlighted by Homsi et.\ al, that a large amount of variance may be observed across folds~\parencite[p.1637]{Homsi2017}. This is attributed to the variations across databases, making generalisation difficult. To account for this, it is suggested that cross-validation is repeated multiple times and averaged to provide a more accurate measurement of performance across folds. For the proposed system, cross validation was repeated 10 times for each fold and averaged to produce the final results. Standard-deviation is also calculated across these iterations to illustrate the possible prevalence of this.\\ A common theme throughout the literature was that of generalisation across databases. It was observed that many previous algorithms achieved high accuracies in standard cross-validation, but performed significantly worse when testing on unseen databases~\parencite{Homsi2017, Bobillo2016}. For this reason, leave-one-out cross-validation was used to form a better understanding of the system's ability to generalise to unseen data from different sources. On each fold, a single database is removed, training on all other databases. This is a useful method as it can be used to determine the level to which information extracted from databases is representative of other databases.\\ The evaluation of models using cross-validation was not limited to final evaluation. Evaluation of intermediate models generated by both the SFFS and Particle Swarm algorithms was possible, by further separating the test set into test and validation fold. This technique was used in the optimisation of the final model to provide scores for intermediate models, as well as to help avoid overfitting parameters and features to the training set used for optimisation.\\ Discussion on the performance of the proposed system using these methods can be found in Section~\ref{Eval}. % TODO: Insert cross validation diagram from data science handbook \section{Implementation} This section describes the tools used in the realisation of the proposed system, the practical issues encountered throughout the implementation process and the development strategy taken to address and avoid such issues. Rationale is given for decisions made throughout production of the proposed system and any known issues with the current implementation are outlined. \subsection{Development strategy} Early in the design process it became apparent that in order for this project to produce reasonable results, it would need to utilise a number of complex algorithms to handle the various non-trivial problems that were encountered throughout development. Python was chosen as the most suitable language for the implementation of the system. High level language features such as dynamic types and automatic garbage collection, combined with the large variety of readily available packages and libraries, make the language a good choice for the fast, flexible development approach taken throughout this project.\\ The most significant objective from the outset of the project was to provide a system that could classify pathological signals with a degree of accuracy that was comparable to the current state of research in the field of PCG analysis. Given this focus and that the performance of the final product was initially unknown, it was recognised that the design of the project would need to adapt as the project progressed, implementing and testing high level concepts to iteratively improve performance of the system. For this reason a high level view of production was taken, choosing to focus on the overall system architecture, rather than spending great amounts of time on on any one specific element of the project (as any component of the project could be removed/replaced entirely, if this facilitated the improvement of results).\\ With this design ethos in mind, it was decided that external packages and libraries would be used/adapted wherever necessary to avoid spending large amount of time developing proprietary implementations of proven concepts. By not `reinventing the wheel', it would be possible to rapidly prototype and evaluate high level concepts, such as the variety of machine learning and optimisation algorithms detailed in previous sections, quickly and effectively. Due to it's active developer community, a wide range of scientific computing and machine learning algorithms are available, such as NumPy~\parencite{VanDerWalt2011}, SciPy~\parencite{Millman2011} and Scikit-Learn~\parencite{Pedregosa2011}, each of which was used extensively throughout the project alongside other packages detailed in the following sections. \subsection{System overview} The proposed system can be broken down into 5 key components: the user interface, feature generation module, classification module, optimisation module and evaluation module. The overall architecture of the system follows a common design pattern for machine learning based systems; Taking a set of input data, augmenting to produce associated data, extracting patterns from said data and evaluating to understand the effects of various implementation choices. The reason for this maintain focus on the central innovations of the project which are thought to be in the implementation of models and the analysis of input data. \subsubsection{User interface and project framework} As a project focused on producing a functional proof of concept for a specific problem, this project was not aimed at producing a user-facing application. For this reason, interactions with the system are based primarily around a simple, well documented commandline interface (CLI) that allows a user to specify relevant parameters through use of CLI flags (CLI output flags are detailed in Appendix~\ref{appendixB}). This gives the user relevant control over optional parameters that are situation specific (such as multiprocessing, specifying locations to save data etc\ldots) through an interface that is both easy to maintain throughout development and intuitive for a user familiar with command line applications. Relevant information is then presented to the user as the program progresses through the use of standard output and file based logging systems. Providing informative feedback to the user was essential, both during development for debugging processes, and to provide an understanding of the programs progress, particularly in long-running iterative processes used for optimisation.\\ A file based logging system was developed using Python's built-in logging module to allow for the monitoring of threaded processes. This allowed for detailed monitoring of the systems progress, even when running multiple operations concurrently.\\ A significant issue that developed as the project grew in size and complexity was the running time. As more complex methods were implemented for feature extraction and model optimisation, the time taken to process the relatively large dataset grew considerably. Primarily using Python's object pickling functionality, Methods were implemented to handle the creation and maintenance of intermediate feature and parameter files, allowing the program to load previously generated data. This helped in cutting computation times of subsequent runs significantly. This also provided a convenient facility for storing models, in a portable fashion for transfer between computers, for example. \subsubsection{Features extraction} As a fundamental component of the classification system, the feature extraction method required careful planning and development to realise the extraction of the large number of features. As a feature of particular importance, the integration of the MATLAB segmentation algorithm was a key part of the feature extraction method. Given a pre-made MATLAB implementation, the most problematic step was the integration of this code seamlessly with the largely incompatible Python framework. This was achieved through use of Python's Subprocess module, allowing for the code to be called through an external unix process. The nature of this implementation limits the compatibility of the system to unix only systems. However, given the nature of the project, this is not considered an issue.\\ Having extracted segmentation data, methods for the efficient calculation and storage of other features were considered. It was recognised that in order to maintain such a large number of features, an organised and efficient data format would be needed. The popular Pandas DataFrame library was chosen for this task. Built on the NumPy array object, DataFrames provide a labelled data structure, ideal for storing large quantities of labelled data. As the DataFrame is based on the Numpy array it is able to combine the powerful matrix operations of Numpy with intuitive database queries commonly seen in languages such as SQL. This data structure also allows for simplistic storing and loading of DataFrames through built-in interfaces to the HDF5 file system, simplifying the storage of large quantities of intermediate data.\\ A minor issue that was found during development was the lack of compatibility between DataFrames and Scikit-Learn (described in Section~\ref{Sklearn}). This simply required careful handling of operations between these APIs in order to avoid unintentional mishandling of data.\\ As mentioned previously, the calculation of features often required many mathematical operations to be performed on vectors. The use of Numpy array-like containers simplified this operation, allowing many features to be calculated in a single line of code. Standard operations were sufficient for a number of the features. However, mathematically complex features such as MFCCs and wavelet transforms required dedicated processing libraries. The open source pyWavelets and Librosa project were used for initial generation of MFCCs and the DWT respectively~\parencite{pyWave, Mcfee2015}. Further processing such as segmentation, entropy analysis etc\ldots is calculated to produce the final features. For a details of all features please refer to Appendix~\ref{appendixA}.\\ Given the large number of operation required for feature extraction, a large amount of time needed to compute features was an unavoidable consequence of the design. To help alleviate this issue, processing of features was parallelised, using each sample as an individual job. The speed-up aquired through parellisation is inherently dependant on the system running the program, however, this significantly reduced the computation time of features. A modified implementation of Python's multiprocessing module was used for task management. \subsubsection{Classification model generation}\label{Sklearn} The Scikit-learn machine learning package is a popular choice for implementation of machine learning algorithms in Python.~\parencite{Pedregosa2011} As a result, it is well maintained and benefits from a number of smaller projects that are designed to be compatible with it's API. Such projects include Mlxtend, designed to expand Scikit-learns range of features with more esoteric models and tools~\parencite{Raschka2016,} and Imbalance-learn, which addresses the lack of sample balancing transformations in Scikit-learn~\parencite{Lemaitre2017}. The project's aim to provide a standard interface to each of it's algorithms also aids in the quick prototyping of many machine learning algorithms on the features generated. During the initial stages of development, this allowed for models to be tested quickly to gain a rudimentary understanding of the performance of models. From this, it was possible to explore more complex models and ultimately resulted in the use of Mlxtend's Stacking ensemble classifier.\\ As the size of the classification codebase grew, so did the number of chained operations in the classification process. Scikit-learn's Pipeline utility was used to maintain correct handling of these process chains, allowing for models to be easily formed of multiple transforms and classifiers in a manageable manner. This gained particular significance with the implementation of model optimisation methods, as discussed in the following section. \paragraph{Model selection/optimisation}\label{ModOp} The decision to use an ensemble classifier also presented a complication, through the need to choose multiple complimentary base classifiers. A further issue was in the growing number of parameters that would need to be tuned to obtain optimal results from each of the chosen base classifiers. The adaption of a particle swarm optimisation algorithm was found to be an effective solution to both of these problems~\parencite{Claesen2014}. Using model pipelines to encapsulate all transformations and classifiers into a single object, it was possible to construct a dictionary of multiple potential Scikit-learn models for each base classifier. Implementation of a wrapper function around the full stacking classifier then allowed classifier pipelines to be treated as hyperparameters, which could be optimized on a training set alongside all active parameters for each base classifier. This resulted in the simultaneous selection and optimization of model combinations and hyperparameters.\\ Mlxtend's implementation of SFFS was also implemented to apply feature reduction to the dataset. This was initially implemented as part of the processing pipeline, resulting in feature reduction being applied on every iteration of parameter optimisation. This initially provided a score for each model based on the selection of it's optimal parameters. However, as models grew in complexity, the growing computational complexity resulted in this method becoming infeasible. This was addressed by re-implementing the feature selection algorithm after optimisation on the full dataset. The reduction in performance from this is not thought to be significant.\\ As with parameters, models were saved using a combination of Python's pickling functionality and Panda's HDF5 export methods, to create fully portable models. \subsubsection{Automatic system evaluation} In order to accurately place the system in the context of current research, evaluation metrics were needed to perform automatic testing of the system. Metrics were implemented as described in Section~\ref{metrics} using a custom multi-scorer object that was adapted to allow for the calculation of the 3 metrics: sensitivity, specificity and score. Using this object in conjunction with a selection of Scikit-learns cross-validation objects provided a mechanism for quickly evaluating models in an equivalent fashion to those presented in the literature. In addition, a simple test train split was implemented allowing for the use of a hold-out dataset. Performance on this dataset was evaluated directly using the custom scoring functions and Scikit-learn's model scoring methods.\\ Finally, results were formatted into tables and logged to provide instant feedback to the user on the performance of the current model. \section{Results}\label{Eval} The system was evaluated using 3 primary scoring methods (as described in Section~\ref{metrics}). \begin{itemize} \item Score on hidden test set \item Leave-one-out database cross-validation \item 10-fold stratified cross-validation \end{itemize} The final optimised model was generated using a total of 43 selected features out of a possible 188. Parameter optimisation was run with 1000 parameter evaluations, resulting in 50 iterations using 20 particles. Final parameters and selected features for the chosen algorithms are detailed in table~\ref{OpParam}.\\ The final scores produced for this model, evaluated using the full dataset, can be found in Table~\ref{TestSet}. Scores for Leave-one-out cross-validation and 10-fold cross-validation can be seen in Figure~\ref{fig1} and Figure~\ref{fig2} respectively. Details can be found in Appendix~\ref{appendixD}. \begin{table}[H] \centering \caption{Hidden test-set scoring} \label{TestSet} \begin{tabular}{@{}lll@{}} \toprule $Acc$ & $Se$ & $Sp$ \\ \midrule 80.31\% & 83.15\% & 77.47\% \\ \bottomrule \end{tabular} \end{table} % Make lists without bullets and compact spacing \renewenvironment{itemize}{ \begin{list}{}{ \setlength{\leftmargin}{1.5em} \setlength{\itemsep}{0.25em} \setlength{\parskip}{0pt} \setlength{\parsep}{0.25em} } }{ \end{list} } \setlist[enumerate]{itemsep=0.25em} \begin{table}[H] \centering \caption{Optimised model parameters and selected features} \scriptsize \label{OpParam} \begin{tabulary}{\linewidth}{LLLL} \toprule Base Classifier A & Base Classifier B & Base Classifier C & Meta Classifier \\ \midrule Model: Linear SVM & Model: RBF Kernel SVM & Model: Naive-Bayes & Model: Logistic Regression \\ C: 4.2507 & C: 4.9452 & & C: 14.3611 \\ & Gamma: 0.4558 & & Penalty: L2 \\ \bottomrule \end{tabulary} \singlespacing \begin{multicols}{5} \scriptsize \begin{itemize} \item AvrA5diaShan \item AvrA5s1Shan \item AvrA5s2Shan \item s2MFCC0 \item AvrD1s2Shan \item s2MFCC12 \item s2MFCC4 \item s2MFCC6 \item s2MFCC9 \item s2Max \item s2Mean \item AvrD4s2Shan \item AvrD5s2Shan \item s2ZeroX \item sd\_RR \item TPTs1 \item s2Dur \item sysMFCC2 \item sysMax \item sysSampEnt \item sysShanEngy \item sysSkew \item sysVar \item diaMFCC11 \item diaMFCC12 \item diaMFCC6 \item diaSampEnt \item diaShanEngy \item diaVar \item diaZeroX \item heartRate \item m\_RR \item m\_Ratio\_DiaRR \item mean\_IntS1 \item mean\_IntSys \item s1Dur \item s1MFCC11 \item s1MFCC4 \item s1Max \item s1Mean \item s1ShanEngy \item s1Var \item s1ZeroX \end{itemize} \end{multicols} \end{table} \begin{figure}[H] \caption{Leave-one-out cross-validation results (mean and std-dev)} \makebox[\textwidth]{\includegraphics[width=1.1\textwidth]{logo}} \label{fig1} \end{figure} \begin{figure}[H] \caption{Stratified 10-fold cross-validation results (mean and std-dev)} \makebox[\textwidth]{\includegraphics[width=\textwidth]{10_fold}} \label{fig2} \end{figure} Due to the replication of the approach taken for scoring entries to the Physionet challenge, it is possible to directly compare results to challenge entries. This aims to provide a thorough understanding of the performance of the proposed system in relation to other approaches. The system is further compared to some successful algorithms prior to the challenge in the subsequent section, in order to understand the performance of the system in a wider context of heart sound analysis.\\ The most directly comparable results are to those presented by challenge participants, used during the training of their algorithms. Many participants used similar cross-validation scores to determine the performance of their algorithm before testing on the final hidden dataset, and these provide a key insight into the performance with regard to a variety of aspects.\\ Results obtained using the Leave-one-out cross-validation scoring are similar to those of the highest scoring algorithms in the challenge~\parencite{Homsi2017, Bobillo2016}. As a measure for performance on unseen data, this suggests that the proposed algorithm generalises to a similar degree. However, it is clear that algorithms generally score poorly in this area. This is the general consensus across many of the algorithms presented for the challenge and is a problem that requires further work. Higher scores in 10-fold cross validation than those of Leave-one-out cross-validation further suggest that the algorithm is highly susceptible to degraded results, most likely as a consequence of signal qualities varying from those of the training set.\\ In addition, scores were also applied to the algorithm after balancing the full database. The aim of this was to remove class imbalance across the training and test set, to gain an understanding of how the model performs on each class equally. Results of these tests can be viewed in Appendix~\ref{appendixC}. It was found that, although hidden test set and 10-fold cross-validation scores aren't affected by class imbalance, there is a significant increase in the overall leave-one-out cross-validation score from 54.16\% to 66.13\%. This is currently thought to be caused by the model not resampling by database during training. As resampling during training does not maintain the balance between datasets, a disproportionate number of normal samples from larger datasets is likely to be present in the training set. This may cause the model to overfit this quality of sample recording, resulting in poor classification on the left out test set. It should however be noted that the significant reduction in samples caused by the resampling makes fitting a model that generalises to other databases easier, thus generating inflated results. However, when comparing the balanced database results to those of Kay et al., overall performance is still found to be significantly better than that of their proposed algorithm, despite use of a larger balanced dataset~\parencite{Kay2017}. This is an area that requires further investigation, and could potentially improve results of the algorithm considerably. A further issue that has been highlighted throughout previous literature, is the use of different recording equipment for the recording of normal and pathological signals in database $e$~\parencite{Kay2017, Bobillo2016}. It is suggested that this may have a significant impact on results, as an algorithm may inadvertently learn the difference in noise characteristics of signals, rather than of the heart sounds themselves. This is of particular importance as database $e$ is the largest available dataset. This issue is found in results through examination of database $e$ in participant's leave-one-out cross-validation scores. This may also be true in the case of the proposed system as database $e$ has shown considerably higher specificity in results than those the other database, both in balanced and unbalanced datasets. Further would be needed to understand the extent of the effect that this has on the performance of the proposed system.\\ The final 10-fold cross-validation score was found to be between, 2 and 12\% less than those of the highest scoring models~\parencite{Zabihi2016, Homsi2017, Kay2017}. This suggests that, when presented with signals similar to those presented in training, the proposed model is not able to find distinguishable patterns between healthy and pathological signal to the same degree as the leading algorithms in the field. However, accuracy scores of 80\% show promise for this method. Sensitivity and specificity scores are roughly equal, suggesting little bias between normal and abnormal classes. The method also performed equally well on the hidden test set. The hidden test set score is the only score based on predictions where no samples had previously been seen by the algorithm during optimisation. A similar score to that of 10-fold cross validation suggests that chosen features and hyperparameters generalise well to unseen data. If scores in cross validation had been significantly higher than that of the hidden test set, it would suggest that the model is tuning parameters and features in a way that only benefits the score of the training set. These scores suggest that the model is reasonably robust, and that with improvements detailed in section~\ref{FutureWork} there is considerable potential for this method.\\ Some of the most successful algorithms prior to the challenge also achieve similar levels of success (As detailed in Table~\ref{PriorWorkTable}, although these methods were evaluated on different datasets, so are not as directly comparable)~\parencite{Ari2010, Quiceno-Manrique2010a, Maglogiannis2009}. This clearly shows that high levels of accuracy are attainable through machine learning methods for PCG classification. In addition these methods also venture further into the area of classification for specific pathologies, a topic not widely considered for the challenge. \section{Discussion and further work}\label{FutureWork} The current implementation of the system has provided promising results, suggesting that the combination of techniques is well suited to the task of abnormality detection. It is clear however, that further development of the system could improve results further. This section defines some of the recognised issues that could be addressed in each of the system's components, in order to build on the work of the project in it's current form. Areas of interest are also outlined to suggest potential avenues for building on the system's current functionality.\\ A potential flaw in the current project is that beyond segmentation of the original signal, pre-processing (and other components of the system) currently make little use of biomedical domain knowledge to aid in processing of the input data. This is largely due to the author's lack of background in this area, prior to development of this project. An example of a project that has implemented this is the work by Goda et al.\ who, by recognising that trained professionals can classify most heart conditions, given at least 5 seconds of audio, was able to further segment audio in 5 second overlapping segments, essentially providing additional atomic samples for training~\parencite{Goda2016}. It is thought that other such assumptions based on physiological understanding could be made in order to further enhance the system.\\ Another area that requires further attention is the implementation of the resampling method, as discussed in Section~\ref{Eval}. It has been shown that there is a significant possibility for improving leave-one-out cross-validation scores by implementing resampling per-database. In terms of effort, this would be a relatively simple implementation and was not included in the project's current form primarily due to comprehension of the issue late in the project. It is hoped that this would produce considerable improvements in the generalisation performance of the project. In addition to this, resampling could conceivably be integrated in the hyperparameter optimisation method, in a similar fashion to the models themselves, allowing for selection of the resampling methods dynamically. This has the potential to improve accuracy of the classifier by providing an optimal selection of training data for the task.\\ The combination of hyperparameter and model optimisation was regarded as one of the key reasons for it's success. The current method, although sufficient for the task, may be further improved through deeper understanding of the models, the interaction of the models in stacking classifiers, and the parameter optimisation method.\\ For example, in the final selection of models, a linear SVM, RBF kernel SVM and Naive Bayes models were chosen by the system. From intuition it is thought that the reason these worked well is due to the complex combination of linear and non-linear relationships in the input features. As the RBF kernel is well suited to differentiating non linear patterns, and the linear SVM is well suited for linear patterns, these models would in theory compliment one another. This is also true of the Naive-bayes model, which considers each feature in isolation from all others, contrasting the complex inter-feature relationships (such as those most likely present in the MFCC and wavelet coefficients, for example) considered by the other models. Given this intuition, more targeted models could be provided to the dictionary for model selection, allowing for more efficient convergence of the optimisaton algorithm. This theory is however, simply conjecture at the current stage, and would require further research to form a better understanding of the problem.\\ Building on this, it is known that a wide variety of optimisation methods, such as genetic algorithms and swarm algorithms~\parencite[p.32]{Sesmero2015}. Deeper understanding of these may yeild a more suitable optimisation method than the tradition particle swarm method used in the current implementation.\\ The features in the current project clearly provided a reasonable representation of the signal, suitable for the classification task. This was largely due to the selective procurement of these features from literature, where they had been demonstrated to provide useful representations of audio both for PCG signals and in a more general context. Further analysis of individual features and the feature set as a whole may allow for a more refined feature set to be developed. This is in contrast to the current methodology which largely relies on the trial and error approach of SFFS. A deeper understanding may allow for more complex and context specific features to be added to the feature set, and for other ineffective features to be removed.\\ An example of this is the implementation of MFCC features. Although the current MFCCs were seen to be relevant features, the coefficients are used directly in a rather simplistic fashion, without any further processing. Other implementations have shown that further processing could be used to augment these features~\parencite{Zabihi2016}, providing additional potentially relevant statistical data to the feature-set.\\ An application of the system that has not been considered in any great depth is the further suitability of the method for more specific cardiovascular classification tasks. A number of projects prior to the challenge have demonstrated the success of methods similar to the proposed method in this area~\parencite{Ari2010, Quiceno-Manrique2010a}. The work of Ari et al.\ is of particular interest, as it demonstrated a reasonable general classifier using similar SVM based classifiers to that of the current project, but also aimed to further classify the signal using subsequent nested classifiers. This method could be applied to potentially develop a highly useful diagnostic tool.\\ Given the quality of results presented, it is believed that this project has successfully demonstrated the potential for heuristically optimised stacking classifiers to detect pathological heart sounds. With additional improvements, it is thought that the proposed methodology could be built upon, potentially providing an innovative alternative solution to the currently used methods for heart sound analysis. \pagebreak \appendix \section*{Appendices} \addcontentsline{toc}{section}{Appendices} \renewcommand{\thesubsection}{\Alph{subsection}} \subsection{Table of Features}\label{appendixA} \begin{table}[H] \centering \caption{Description of features} \onehalfspacing \tiny \label{my-label} \begin{tabulary}{\linewidth}{LLLLL} \hline Tag & Feature Name & Description \\ \hline heartRate & Heart Rate & The number of beats per minute (BPM) \\ m\_RR & Mean RR Interval & Average length of heart cycles \\ sd\_RR & Std-dev RR Interval & Standard deviation of heart cycles \\ mean\_Int* & Mean Segment Interval & Average length of segment \\ sd\_Int* & Std-dev Segment Interval & Standard deviation of segment \\ R\_SysRR & Systolic-RR Ratio & Ratio of Systolic interval to RR interval \\ R\_DiaRR & Diastolic-RR Ratio & Ratio of Diastolic interval to RR interval \\ R\_SysDia & Ratio of Systolic-RR/Diastolic-RR Ratios & Ratio of above ratios \\ *ZeroX & Zero-crossing & Zero-crossing rate of a segment \\ *RMS & Root Mean Square & The Root Mean Square of a segment \\ *ShanEngy & Shannon Energy & The Averaged Shannon Energy Envelope of a segment \\ *Dur & Duration & The duration of a segment \\ *Max & Max & The peak value of an absolute segment \\ *Mean & Mean & The mean value of a segment \\ *Skew & Skewness & The temporal skewness of a segment \\ *Kurt & Kurtosis & The temporal kurosis of a segment \\ *Var & Variance & The variance of a segment \\ *SampEnt & Sample Entropy & The sample entropy of a segment \\ *ShanEnt & Shannon Entropy & The Shannon Entropy of a segment \\ TPT* & Total Power (time) & The total power of a segment in the time domain \\ TPF* & Total Power (frequency) & The total power of a segment in the frequency domain \\ *Flat & Spectral Flatness & The flatness of a segment's frequency spectrum \\ *Cent & Spectral Centroid & The centroid of a segment's frequency spectrum \\ *Spread & Spectral Spread & The spread of a segment's frequency spectrum \\ *MFCC\{$n$\} & Mel-frequency Cepstrum Coefficients & MFCC coefficient number $n$, where $n = \{1, \ldots, 13\}$ \\ m\_Ratio\_SysRR & Mean RR/Systole interval ratio & Mean value of the interval ratios between systole and RR in each heart beat \\ sd\_Ratio\_SysRR & Std-dev RR/Systole interval ratio & Std-dev value of the interval ratios between systole and RR in each heart beat \\ m\_Ratio\_DiaRR & Mean RR/Diastole interval ratio & Mean value of the interval ratios between diastole and RR in each heart beat \\ sd\_Ratio\_DiaRR & Std-dev RR/Diastole interval ratio & Std-dev value of the interval ratios between diastole and RR in each heart beat \\ m\_Ratio\_SysDia & Mean Systole/Diastole interval ratio & Mean value of the interval ratios between systole and diastole in each heart beat \\ sd\_Ratio\_SysDia & Std-dev Systole/Diastole interval ratio & Std-dev value of the interval ratios between systole and diastole in each heart beat \\ D[1-5]Shan & Detail coefficient shannon entropy & Shannon entropy of DWT detail coefficient 1-5 \\ A5Shan & Approximation coefficient shannon entropy & Shannon entropy of DWT approximation coefficient 1-5 \\ \mbox{TotD[1-5]*Shan} & Total detail coefficient shannon entropy & Total Shannon entropy of DWT detail coefficient 1-5 across signal \\ TotA5*Shan & Total approximation coefficient shannon entropy & Total Shannon entropy of DWT approximation coefficient 1-5 across signal \\ \hline \end{tabulary} \justifying \scriptsize Feature sources include:~\parencite{Homsi2016, Schmidt2015, Liang1998, Lerch2012}\\ `*' --- denotes feature is applied to S1, systolic, S2 and diastolic segments respectively. \end{table} \pagebreak \subsection{Commandline Interface}\label{appendixB} \singlespacing \lstset{basicstyle=\tiny, style=mystyle} \begin{lstlisting}[numbers=none] usage: main.py [-h] [--features-fname OUTFNAME] [--segment] [--optimize] [--eval EVAL] [--select-features SELECT\_FEATURES] [--backward] [--parameters_fname OUTFNAME] [--fs_fname OUTFNAME] [--no-parallel] [--reanalyse] [--verbose] [--resample-mix RESAMPLE_MIX] [--keep-logs] TESTDIR OUTDIR Script for the classification of PCG data. positional arguments: TESTDIR Directory of test data to train the system OUTDIR Directory to store output analyses optional arguments: -h, --help show this help message and exit --features-fname, -o Specify the name of the file to save generated features to for future use --segment Run Matlab segmentation script to create segmentation analysis --optimize Run optimization algorithm to find best model and parameters for classifier --eval, -e Number of evaluation to pass to the particle swarm optimization --select-features Run feature selection algorithm to find best features for model, either selecting or reducing features by the integer specified. This depends on use of --backward flag, to determine forward or backward feature selection. (a value of 0 skips feature selection entirely, using previously generated features if available. A value less than 0 uses all available features.) --backward, -b Runs backward feature selection as opposed to default forward selection. --parameters_fname Specify the name of the file to save generated features to for future use --fs_fname Specify the name of the file to save generated feature selection model to for future use --no-parallel, -p Disable processing in parallel. (Will likely decrease performance but may aid in debugging) --reanalyse Force regeneration of database features --verbose, -v Specifies level of verbosity in output. For example: '-vvvvv' will output all information. '-v' will output minimal information. --keep-logs Keep previously generated logs that aren't overwritten by current process \end{lstlisting} \doublespacing \pagebreak{} \subsection{Final results}\label{appendixD} Results of of tests on final optimised model\\ Leave-one-out scores are shown in Table~\ref{LOGO}\\ Stratified cross-validation scores can be found in Table~\ref{KFCV}\\ \begin{table}[H] \doublespacing \caption{Leave-one-out scores} \label{LOGO} \footnotesize All scores are an average of 10 iterations $\pm$ standard-deviation \scriptsize \centering \begin{tabulary}{\linewidth}{LCCCCCCC} \toprule & A & B & C & D & E & F & Mean \\ \midrule $Acc$ & $0.5395\pm0.0104$ & $0.4896\pm0.0129$ & $0.5673\pm0.0298$ & $0.5173\pm0.0223$ & $0.5869\pm0.0300$ & $0.5492\pm0.0140$ & $0.5416\pm0.0318$ \\ $Se$ & $0.7281\pm0.0164$ & $0.8664\pm0.0240$ & $0.6775\pm0.0208$ & $0.7865\pm0.0218$ & $0.5397\pm0.0459$ & $0.7387\pm0.0493$ & $0.7228\pm0.1005$ \\ $Sp$ & $0.3509\pm0.0264$ & $0.1127\pm0.012$ & $0.4571\pm0.0571$ & $0.2481\pm0.0416$ & $0.6340\pm0.0387$ & $0.3596\pm0.0464$ & $0.3604\pm0.1624$ \\ \bottomrule \end{tabulary} \end{table} \begin{table}[H] \caption{10-fold cross-validation score} \footnotesize All scores are an average of 10 iterations $\pm$ standard-deviation \doublespacing \label{KFCV} \scriptsize \centering \begin{tabulary}{\linewidth}{LCCCCCCCCCCC} \toprule & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & Mean \\ \midrule $Acc$ & $0.7969\pm0.0246$ & $0.8049\pm0.0244$ & $0.8043\pm0.0153$ & $0.8111\pm0.0295$ & $0.8095\pm0.0261$ & $0.7999\pm0.0208$ & $0.8061\pm0.0299$ & $0.8150\pm0.0198$ & $0.8140\pm0.0245$ & $0.7928\pm0.0224$ & $0.8055\pm0.0069$ \\ $Se$ & $0.8121\pm0.0420$ & $0.8164\pm0.0360$ & $0.8193\pm0.0302$ & $0.8184\pm0.0634$ & $0.8158\pm0.0484$ & $0.8061\pm0.0438$ & $0.8325\pm0.0546$ & $0.8421\pm0.0321$ & $0.8246\pm0.0474$ & $0.7798\pm0.0302$ & $0.8167\pm0.0157$ \\ $Sp$ & $0.7818\pm0.0293$ & $0.7935\pm0.0267$ & $0.7894\pm0.0208$ & $0.8037\pm0.0280$ & $0.8033\pm0.0226$ & $0.7937\pm0.0214$ & $0.7798\pm0.0229$ & $0.7878\pm0.0206$ & $0.8035\pm0.0219$ & $0.8059\pm0.0228$ & $0.7942\pm0.0091$ \\ \bottomrule \end{tabulary} \end{table} \pagebreak \subsection{Balanced dataset test results}\label{appendixC} Results of testing database using a resampled, balanced dataset.\\ Dataset was resampled by database, using jacknife resampling (Sampling without replacement) and consisted of a total of 944 samples. \begin{table}[H] \centering \caption{Hidden test-set scoring} \begin{tabular}{@{}lll@{}} \toprule $Acc$ & $Se$ & $Sp$ \\ \midrule 80.77\% & 79.41\% & 82.14\% \\ \bottomrule \end{tabular} \end{table} \begin{table}[H] \doublespacing \caption{Leave-one-out scores} \footnotesize All scores are an average of 10 iterations $\pm$ standard-deviation \scriptsize \centering \begin{tabulary}{\linewidth}{LCCCCCCC} \toprule & A & B & C & D & E & F & Mean \\ \midrule $Acc$ & $0.5784\pm0.0153$ & $0.6062\pm0.0131$ & $0.8737\pm0.0131$ & $0.5939\pm0.0302$ & $0.7022\pm0.0136$ & $0.6130\pm0.0189$ & $0.6613\pm0.1029$ \\ $Se$ & $0.4984\pm0.0225$ & $0.8816\pm0.0000$ & $0.7475\pm0.0261$ & $0.6692\pm0.0319$ & $0.6417\pm0.0227$ & $0.7290\pm0.0636$ & $0.6946\pm0.1161$ \\ $Sp$ & $0.6585\pm0.0342$ & $0.3309\pm0.0262$ & $1.0000\pm0.0000$ & $0.5185\pm0.0741$ & $0.7628\pm0.0134$ & $0.4971\pm0.0509$ & $0.6280\pm0.2141$ \\ \bottomrule \end{tabulary} \end{table} \begin{table}[H] \caption{10-fold cross-validation score} \footnotesize All scores are an average of 10 iterations $\pm$ standard-deviation \doublespacing \scriptsize \centering \begin{tabulary}{\linewidth}{LCCCCCCCCCCC} \toprule & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & Mean \\ \midrule $Acc$ & $0.7961\pm0.0364$ & $0.8187\pm0.0384$ & $0.8108\pm0.0238$ & $0.8019\pm0.0316$ & $0.8103\pm0.0326$ & $0.8217\pm0.0417$ & $0.7845\pm0.0593$ & $0.8053\pm0.0262$ & $0.8023\pm0.0148$ & $0.8105\pm0.0312$ & $0.8062\pm0.0103$ \\ $Se$ & $0.7922\pm0.0603$ & $0.8060\pm0.0453$ & $0.7853\pm0.0363$ & $0.7772\pm0.0489$ & $0.8044\pm0.0504$ & $0.8070\pm0.0538$ & $0.7553\pm0.0700$ & $0.7623\pm0.0312$ & $0.7772\pm0.0272$ & $0.8088\pm0.0461$ & $0.7876\pm0.0184$ \\ $Sp$ & $0.8000\pm0.0360$ & $0.8315\pm0.0504$ & $0.8363\pm0.0289$ & $0.8266\pm0.0366$ & $0.8161\pm0.0351$ & $0.8363\pm0.0627$ & $0.8137\pm0.0578$ & $0.8484\pm0.0380$ & $0.8274\pm0.0284$ & $0.8123\pm0.0412$ & $0.8249\pm0.0136$ \\ \bottomrule \end{tabulary} \end{table} \pagebreak \printbibliography{} \end{document}