1999 lines
120 KiB
TeX
1999 lines
120 KiB
TeX
\documentclass[titlepage, 12pt]{scrartcl} \usepackage{enumitem}
|
||
|
||
\usepackage[british]{babel}
|
||
\usepackage[style=apa, backend=biber]{biblatex}
|
||
\DeclareLanguageMapping{british}{british-apa}
|
||
\usepackage{url}
|
||
\usepackage{float}
|
||
\usepackage{ragged2e}
|
||
\usepackage{caption}
|
||
\usepackage{multicol}
|
||
\newcommand{\tabitem}{~~\llap{\textbullet}~~}
|
||
%\restylefloat{table}
|
||
\usepackage[table]{xcolor}
|
||
\usepackage{multirow}
|
||
\usepackage{perpage}
|
||
\MakePerPage{footnote}
|
||
\usepackage{abstract}
|
||
\usepackage{graphicx}
|
||
\usepackage{setspace}
|
||
% Create hyperlinks in bibliography
|
||
\usepackage{hyperref}
|
||
\usepackage{amsmath}
|
||
\usepackage{booktabs}
|
||
\usepackage{tabulary}
|
||
|
||
\usepackage[pass]{geometry}
|
||
\usepackage{pdflscape}
|
||
\usepackage{graphicx}
|
||
|
||
\usepackage[T1]{fontenc}
|
||
\usepackage[utf8]{inputenc}
|
||
\usepackage{blindtext}
|
||
\setkomafont{disposition}{\normalfont\bfseries}
|
||
|
||
\usepackage{etoolbox}
|
||
|
||
\usepackage{titlesec}
|
||
\setcounter{secnumdepth}{4}
|
||
|
||
\titleformat{\paragraph}
|
||
{\normalfont\normalsize\bfseries}{\theparagraph}{1em}{}
|
||
\titlespacing*{\paragraph}
|
||
{0pt}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}
|
||
|
||
\graphicspath{{./resources/}}
|
||
\addbibresource{~/Documents/library.bib}
|
||
|
||
\DeclareMathOperator*{\argmax}{arg\,max}
|
||
\DeclareMathOperator*{\argmin}{arg\,min}
|
||
% Fix for medeley's rubbish underscore handeling in generated bib files
|
||
\DeclareSourcemap{
|
||
\maps{
|
||
\map{ % Replaces '{\_}', '{_}' or '\_' with just '_'
|
||
\step[fieldsource=url,
|
||
match=\regexp{\{\\\_\}|\{\_\}|\\\_},
|
||
replace=\regexp{\_}]
|
||
}
|
||
\map{ % Replaces '{'$\sim$'}', '$\sim$' or '{~}' with just '~'
|
||
\step[fieldsource=url,
|
||
match=\regexp{\{\$\\sim\$\}|\{\~\}|\$\\sim\$},
|
||
replace=\regexp{\~}]
|
||
}
|
||
\map{ % Replaces '{\_}', '{_}' or '\_' with just '_'
|
||
\step[fieldsource=url,
|
||
match=\regexp{\{\\\#\}|\{\#\}|\\\#},
|
||
replace=\regexp{\#}]
|
||
}
|
||
}
|
||
}
|
||
|
||
%\newsavebox{\abstractbox}
|
||
%\renewenvironment{abstract}
|
||
% {\begin{lrbox}{0}\begin{minipage}{\textwidth}
|
||
% \begin{center}\normalfont\sectfont\abstractname\end{center}\quotation}
|
||
% {\endquotation\end{minipage}\end{lrbox}%
|
||
% \global\setbox\abstractbox=\box0 }
|
||
|
||
%\makeatletter
|
||
%\expandafter\patchcmd\csname\string\maketitle\endcsname
|
||
% {\vskip\z@\@plus3fill}
|
||
% {\vskip\z@\@plus2fill\box\abstractbox\vskip\z@\@plus1fill}
|
||
% {}{}
|
||
%\makeatother
|
||
|
||
\newcommand{\dtoprule}{\specialrule{1pt}{0pt}{1.4pt}%
|
||
\specialrule{1pt}{0pt}{\belowrulesep}%
|
||
}
|
||
\newcommand{\dbottomrule}{\specialrule{1pt}{0pt}{1.4pt}%
|
||
\specialrule{1pt}{0pt}{\belowrulesep}%
|
||
}
|
||
|
||
\DeclareCiteCommand{\citeyearpar}
|
||
{}
|
||
{\mkbibparens{\bibhyperref{\printdate}}}
|
||
{\multicitedelim}
|
||
{}
|
||
|
||
% MATLAB Code block stuff...
|
||
\usepackage{color}
|
||
\usepackage{listings}
|
||
|
||
\definecolor{dkgreen}{rgb}{0,0.6,0}
|
||
\definecolor{gray}{rgb}{0.5,0.5,0.5}
|
||
|
||
\lstset{language=Matlab,
|
||
keywords={break,case,catch,continue,else,elseif,end,for,function,
|
||
global,if,otherwise,persistent,return,switch,try,while},
|
||
basicstyle=\ttfamily,
|
||
keywordstyle=\color{blue},
|
||
commentstyle=\color{gray},
|
||
stringstyle=\color{dkgreen},
|
||
numbers=left,
|
||
numberstyle=\tiny\color{gray},
|
||
stepnumber=1,
|
||
numbersep=10pt,
|
||
backgroundcolor=\color{white},
|
||
tabsize=4,
|
||
showspaces=false,
|
||
showstringspaces=false}
|
||
\usepackage[shortcuts]{extdash}
|
||
|
||
\definecolor{codegreen}{rgb}{0,0.6,0}
|
||
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
|
||
\definecolor{codepurple}{rgb}{0.58,0,0.82}
|
||
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
|
||
|
||
\lstdefinestyle{mystyle}{
|
||
keywords={},
|
||
numberstyle=\tiny,
|
||
basicstyle=\scriptsize,
|
||
breakatwhitespace=false,
|
||
breaklines=false,
|
||
captionpos=b,
|
||
keepspaces=true,
|
||
numbers=left,
|
||
numbersep=5pt,
|
||
showspaces=false,
|
||
showstringspaces=false,
|
||
showtabs=false,
|
||
tabsize=2
|
||
}
|
||
\lstset{style=mystyle}
|
||
|
||
\begin{document}
|
||
\newgeometry{lmargin=1.5cm}
|
||
\begin{titlepage}
|
||
|
||
\begingroup
|
||
|
||
\setlength{\tabcolsep}{1.5cm}
|
||
|
||
\begin{tabular}[c]{p{0.30\textwidth} | p{0.4\textwidth}}
|
||
|
||
{\vspace{1.2cm} \Large School of Electronic Engineering and Computer Science \par}
|
||
&
|
||
{\vspace{1.2cm} \large Sound and Music Computing \newline Project Report \the\year \par}\\
|
||
|
||
& {\vspace{0.5cm} \Large \textbf{Extraction of Audio Features from PCG Signals for the
|
||
Classification of Heart Abnormalities} \par}\\
|
||
|
||
\vspace{0.4\textheight}
|
||
\includegraphics[width=5cm]{qmul_logo}
|
||
&
|
||
{\vspace{1cm} \large \textbf{Samuel Perry}}\\
|
||
|
||
&
|
||
\multicolumn{1}{|r}{August \the\year}
|
||
|
||
\end{tabular}
|
||
|
||
\endgroup
|
||
|
||
\end{titlepage}
|
||
\restoregeometry
|
||
|
||
\doublespacing
|
||
\begin{abstract}
|
||
A system is presented for the analysis and classification of pathological
|
||
phonocardiogram signals. The project aims to provide a reliable and robust
|
||
detection method to aid in the early detection of cardiovascular
|
||
abnormalities. The project utilises a novel combination of feature
|
||
extraction, optimisation and machine learning techniques to produce
|
||
accurate results, demonstrating the clear potential for this methodology in
|
||
the area of heart abnormality detection.
|
||
\end{abstract}
|
||
|
||
\renewcommand{\abstractname}{Acknowledgements}
|
||
\begin{abstract}
|
||
I would like to thank my supervisor, Dr.\ Tony Stockman, for his guidance and
|
||
support throughout the project. I am also extremely grateful for the advice
|
||
given by Robert Blaauboer. Finally, I would also like to thank John Thompson,
|
||
Tom Pengelly and David Pengelly for their valuable support.
|
||
\end{abstract}
|
||
|
||
\tableofcontents
|
||
\newpage
|
||
|
||
\section{Introduction}
|
||
Cardiovascular diseases are the most prevalent cause of death in Europe,
|
||
accounting for 37.5\% of all fatalities in 2013~\parencite{Eurostat2016}.
|
||
Traditionally, cardiac auscultation has been performed manually using a standard
|
||
stethoscope, with the aim of detecting heart defects aurally. This has been a
|
||
fundamental method for detecting heart valve disorders for over a century.
|
||
However, auscultation is a skill that requires training and can only usually be
|
||
performed by a medical professional, such as a GP. As a result, manual
|
||
auscultation is significantly susceptible to human error~\parencite{Hanna2002}.
|
||
Automation of this method using technology may provide a solution, and
|
||
recent research has shown promise in this area. A large amount of research has
|
||
focused on analysis of Electrocardiogram (ECG) signals. Although useful for
|
||
detecting pathologies, ECG equipment is expensive and requires a trained
|
||
professional for use. Therefore it is not currently feasible for developing
|
||
countries and rural areas where there may be few physicians available. A
|
||
comparatively affordable and non-invasive alternative is the Phonocardiogram
|
||
(PCG)~\parencite[p.130]{Reed2004}. Typically recorded using an electronic
|
||
stethoscope, a PCG signal is a recording of sound made as the heart contracts,
|
||
analogous to the sound heard by physicians when performing cardiac auscultation
|
||
manually. Automated auscultation could provide an initial diagnosis for heart
|
||
defects without the need for a trained medical professional. This would allow
|
||
relatively cheap equipment to analyse a patient's heart sound, and
|
||
automatically recommend further inspection based on analysis. By providing
|
||
earlier diagnosis of conditions that may have otherwise been overlooked, this
|
||
technology could have a significant impact on reducing mortality rates as a
|
||
result of heart conditions.
|
||
|
||
\section{Related work}
|
||
There are currently a wide variety of methods employed for the analysis and
|
||
classification of PCG signals. Current methods can typically be divided into 3
|
||
areas, each of which are combined to create a full classification system. These
|
||
areas are: signal preprocessing, signal segmentation, and feature
|
||
extraction/classification. The performance and evaluation of complete systems
|
||
are also discussed in section~\ref{Classification}
|
||
|
||
|
||
\subsection{Signal preprocessing}
|
||
There are a large number of factors that lead to variation in quality of PCG
|
||
recordings: stethoscope type, make and model, its microphone/sensors, the
|
||
position used to record (i.e.\ lower left sternal border, apex, pulmonic area,
|
||
aortic area), built in filters/signal processing used by the stethoscope (i.e.\
|
||
noise filters, anti-tremor filters), medication that a patient may be taking,
|
||
as well as many other aspects that may influence the recorded
|
||
signal~\parencite[p.4]{Pavlopoulos2004}. This presents a significant issue when
|
||
attempting to analyse and compare a database of signals, as variations in
|
||
recordings and artefacts caused by factors other than heart sounds will most
|
||
likely interfere with analysis and comparison methods. To account for this,
|
||
pre-processing methods are widely used, aiming to standardise a database. This
|
||
is also used as a way to accentuate features of the data that are expected to
|
||
be relevant for classification.\\
|
||
|
||
A common method employed is the use of decimation and static filters to remove
|
||
unwanted spectral content that is most likely noise~\parencite{Liang1997a,
|
||
Homsi2016, Springer2016, Gupta2007}. This helps reduce higher frequency noise
|
||
such as speech, microphone movement, breathing and other interference caused
|
||
externally. Signals are commonly downsampled to around 1--4KHz, with
|
||
anti-aliasing filter specifications varying across the literature. Generally,
|
||
highpass chebychev or butterworth filters are favoured with cutoff frequencies
|
||
ranging from 400--750Hz.\\
|
||
|
||
In addition, many methods decompose the filtered signal using wavelet based
|
||
methods, such as the discrete wavelet transform (DWT)~\parencite{Liang1997a,
|
||
Pavlopoulos2004}, continuous wavelet transform (CWT)~\parencite{Langley2016} or
|
||
wavelet package decomposition (WPD)~\parencite{Liang1998}. These methods are
|
||
commonly used to separate components of a signal based on their spectral
|
||
content. Wavelet transforms are popular as, unlike Fourier transforms, they
|
||
are well localised in both the time and frequency domain. This allows for the
|
||
analysis of PCG signals across multiple frequency bands whilst maintaining
|
||
transient temporal events in the resulting
|
||
decomposition~\parencite[p.93]{Ari2008}. This may be used for analysis of
|
||
transient events such as murmurs, that may consist of higher frequency
|
||
components than normal heart sounds.
|
||
|
||
\subsection{Signal segmentation}\label{Segmentation}
|
||
Algorithms for the segmentation of PCG data aim to extract the structure of the
|
||
signal over time. This is a key stage in the analysis of PCG signals, as the
|
||
structure of the signal and relationships between the fundamental heart sounds
|
||
(FHSs) form the basis for much of further analysis performed on PCG data.\\
|
||
|
||
% TODO: insert segmented graph of PCG cycle
|
||
|
||
A number of methods exist for the extraction of FHSs. Traditional methods rely
|
||
on direct extraction of peaks from amplitude envelopes in the time domain to
|
||
determine the structure of a signal. These methods perform various
|
||
processing/transformations in order to accentuate the transient events with the
|
||
intention of isolating them.\\
|
||
Early work in this area by Liang et.\ al described a method using the popular
|
||
Shannon energy envelope, achieving good accuracy across 37 recordings of
|
||
children~\parencite{Liang1997b}. The algorithm aimed to segment the data by
|
||
first extracting the envelope, then applying adaptive rule based thresholds, to
|
||
determine peaks corresponding to segmentation points. When comparing results to
|
||
hand-annotated ground truth data, the system achieved a reported accuracy score
|
||
of 84\%. However, due to the small sample size, and potential lack of noise in
|
||
the database used, this may not translate well to a larger database recorded in
|
||
sub-optimal conditions.\\
|
||
More recent methods used spectral representations to assist in the splitting of
|
||
the FHSs, in particular using wavelet decomposition. These methods tend to
|
||
perform more robustly on signals of varying conditions.\\
|
||
Building on previous work, Liang et.\ al presented an improved method, using the
|
||
discrete wavelet transform to decompose and reconstruct the signal into 7
|
||
distinct frequency bands~\parencite{Liang1997a}. Applying a similar method
|
||
of envelope extraction and peak picking to each frequency band, the best
|
||
estimate of all frequency bands is then chosen as the final result. Criterion
|
||
for this choice is based on the number of S1s and S2s detected, and the number
|
||
of artefacts discarded for each frequency band. This method achieved an
|
||
improved accuracy of 93\% across a larger database of 77 recordings. This
|
||
suggests that the algorithm is as robust if not more so than previous work by
|
||
Liang et al.\\
|
||
|
||
Vepa et al.\ proposed a wavelet decomposition based method that uses a
|
||
combination of simplicity and envelope features~\parencite{Vepa2008}. This
|
||
approach attempts to improve robustness when analysing signals of varying
|
||
quality by using multiple complimentary features. This allows the method to base
|
||
decisions on a variety of statistical properties. Evaluating the algorithm on a
|
||
collection of 160 heart cycles from a variety of sources, a reported accuracy
|
||
of 84\% was achieved.\\
|
||
|
||
More recently, a variety of machine learning methods have been implemented with
|
||
reasonable success. Gupta et al.\ presented a method that applies $k$-means
|
||
clustering to replace standard threshold-based methods for determining peak
|
||
classification in a segmentation algorithm based on standard
|
||
envelopes~\parencite{Gupta2007}. This achieved a reported accuracy of 90.29\%.
|
||
Due to the envelope-based method for feature extraction, this method is still
|
||
susceptible to noise and artefacts that occur within the frequency bands of the
|
||
heart sounds.\\
|
||
|
||
Sepehri et al.\ proposed a method that combines neural networks with Power
|
||
Spectral Density (PSD) estimates~\parencite{Sepehri2010}. By exploiting the
|
||
periodic nature of S1 and S2 heart sounds, combined with their narrow frequency
|
||
range, a neural network is trained to separate these sounds from other events,
|
||
such as noise and murmurs. This method achieved a reported 93.6\% accuracy on a
|
||
significantly larger database than previous methods detailed.\\
|
||
|
||
The most significant successes in segmentation algorithms have been observed through use
|
||
of probabilistic models such as Hidden Markov Models (HMMs). Early research
|
||
using these models by Ricke et al.\ utilised embedded HMMs to model the 4
|
||
states of the PCG and their transitions~\parencite{Ricke2005}. MFCCs and
|
||
Shannon Energy were used as feature vectors for the models. Results of
|
||
98\% accuracy were reported, although this was tested on only a small database
|
||
of signals.\\
|
||
Gill et al.\ achieved similar results, most notably with specific consideration
|
||
for the duration of each state in the HMM~\parencite{Gill2005}. This is
|
||
handled through the extraction of 6 duration features based primarily on peaks.
|
||
These features form vectors for training the HMM. Results of 98.6\%
|
||
sensitivity, 96.9\% positive predictivity for S1 sounds and 98.3\% sensitivity,
|
||
96.5\% positive predictivity for S2 sounds is reported.
|
||
The issue of state duration was further addressed by Schmidt et.\ al through use
|
||
of a Duration-dependent Hidden Markov Model (DHMM)~\parencite{Schmidt2015}. The
|
||
DHMM is a modified HMM that considers the duration of the current state when
|
||
calculating the probability of transition to another state. This modification
|
||
scored a reported sensitivity of 98.8\% and a positive predictivity of
|
||
98.6\%.\\
|
||
Building on previous work using HMMs, Springer et al.\ presented a segmentation
|
||
algorithm by using Hidden Semi-Markov Models (HSMMs) in combination with
|
||
logistic regression~\parencite{Springer2016}. Use of Hidden semi markov model
|
||
allows for a priori information on the duration of the current state to be used
|
||
in probability calculation of the subsequent state. In this case, the knowledge
|
||
that there is an upper and lower limit on the duration of each component is
|
||
used in the calculation of transition probabilities. A modified viterbi algorithm
|
||
is then used to calculate the most likely set of transitions based on observed
|
||
features. Logistic regression is used to improve discrimination between state
|
||
features when compared to discriminatory methods used by previous work.
|
||
Performance was evaluated on a significantly larger database than previous
|
||
methods and achieved a reported accuracy of $95.63\% \pm 0.85\%$. Due to it's
|
||
rigorous evaluation and high accuracy, this method is currently considered the
|
||
state-of-the-art for PCG signal segmentation.\\
|
||
|
||
Table~\ref{SegmentationTable} provides a brief overview of significant research
|
||
into PCG segmentation. For a more complete summary of the current state of PCG
|
||
segmentation, please refer to Liu et.\ al~\parencite{Liu2016}
|
||
|
||
\newgeometry{margin=1cm} % modify this if you need even more space
|
||
\begin{landscape}
|
||
%\centering
|
||
\begin{table}[htbp]
|
||
\captionof{table}{Summary of Segmentation Algorithms} \label{SegmentationTable}
|
||
\scriptsize
|
||
\rowcolors{1}{gray!15}{white}
|
||
\doublespacing
|
||
\begin{tabulary}{\linewidth}{LLLLL}
|
||
\dtoprule
|
||
Author & Method & databases & \mbox{Reported} Results & Notes \\ \bottomrule
|
||
Springer et.\ al \citeyearpar{Springer2016} & HSMM, Logistic regression & 10,172s of recordings from 112 patients. 12,181 first and 11,627 second heart sounds. & $95.63\pm0.85\%$ & Supervised algorithm. \\
|
||
Huiying et.\ al \citeyearpar{Liang1997b} & Normalised average Shannon energy envelope, peak picking & 37 recordings, 14 pathological murmurs and 23 physiological murmurs. 515 cycles & $91.03\%\;Ac$ & Unsupervised Algorithm. database consists entirely of child recording. Optimized on full database \\
|
||
Vepa et.\ al \citeyearpar{Vepa2008} & Wavelet decomposition, energy and simplicity measurement & 160 heart cycles collected from a variety of sources (training CDs, web resources) & $84\%\;Ac$ & Unsupervised Algorithm, Optimized on full database \\
|
||
Sun et.\ al \citeyearpar{Sun2014} & Viola integral envelope extraction, short-time modified Hilbert transform, peak picking & 6949s of recordings, from 121 patients & $97.37\%\;Ac$ & Supervised algorithm. Tolerance for segmentation accuracy not specified \\
|
||
Sepehri et.\ al \citeyearpar{Sepehri2010} & Spectral density estimation, auto-regressive parameters, multi-layer perceptron neural network & 120 recording, from 60 patients & $93.6\%\;Ac$ & Supervised algorithm \\
|
||
Ricke et.\ al \citeyearpar{Ricke2005} & Shannon energy (and related features), HMM & 9 recordings, from 9 patients & $98\%\;Ac$ & Supervised algorithm \\
|
||
Schmidt et.\ al \citeyearpar{Schmidt2015} & DHMM, Auto-correlation duration features, Homomorphic envelogram & 113 recordings, from 113 patients. 8s per recording. 15 abnormal recordings & $98.8\;Se,\;98.6\;P_+$ on test set & All data recorded ``lateral to the sternum in the fourth intercostal space on the left side''. Mix of noisy and clean recordings. 40 recording used for training, 73 for testing \\
|
||
Gill et.\ al \citeyearpar{Gill2005} & Homomorphic envelogram, Embedded HMMs & 44 recording, 17 subjects. 30-60s per recording & $98.6\%\;Ac, 96.9\;P_+$ for S1. $98.3\;Ac,\;96.5\;P_+$ for S2 & Recording taken in sub-optimal environments (noisy hospitals, offices etc...) \\
|
||
Gupta et.\ al \citeyearpar{Gupta2007} & Homomorphic filtering, $k$-means clustering & 41 patients, 340 heart cycles. 110 normal, 124 systolic murmur, 106 diastolic murmur & $90.29\%\;Ac$ & Unsupervised Algorithm. \\ \hline
|
||
\dbottomrule\\
|
||
\end{tabulary}
|
||
$Ac = \text{Accuracy}, Se = \text{Sensitivity}, P_+ = \text{Positive predictivity}$
|
||
\end{table}
|
||
\end{landscape}
|
||
\restoregeometry
|
||
|
||
|
||
\doublespacing
|
||
|
||
\subsection{Feature extraction/classification models}\label{Classification}
|
||
|
||
A wide variety of methods exist for the extraction of statistical features and
|
||
classification of PCG data. Most notably, the range of methods that were
|
||
developed as a result of the recent Physionet/Computing in Cardiology (CinC)
|
||
Challenge 2016 have improved the quality of abnormality classification in
|
||
noisy signals. The challenge was assembled to provide researchers with a large
|
||
database of normal/pathological PCG signals of varying quality. This enabled
|
||
the development of algorithms that could be evaluated on a significant
|
||
database, in order to determine performance across a range of conditions/signal
|
||
qualities~\parencite{Clifford2016}. This section first details significant work
|
||
produced prior to the challenge, then highlights key works produced for the
|
||
challenge to outline the breadth of methods for robust heart sound analysis.
|
||
|
||
\subsubsection{Work prior to the Physionet challenge}
|
||
Work prior to the Physionet challenge was conducted predominantly with the aim
|
||
of classifying specific heart conditions. Until recently, little research had
|
||
been produced with regards to general abnormality detection, with many projects
|
||
choosing to focus on specific conditions such as murmurs, atrial fibrillation,
|
||
flutter, and heart valve disease. This section outlines some key research
|
||
into these areas, alongside initial research into general abnormality
|
||
detection.\\
|
||
|
||
Reed et al.\ implemented a simple general classification algorithm using
|
||
artificial neural networks (ANNs) and wavelet
|
||
decomposition~\parencite{Reed2004}. As initial work into this field,
|
||
preprocessing such as segmentation is not performed and features remain
|
||
relatively simple when compared to more recent methods. Also, due to the
|
||
comparatively small sample size used for training (1 patient per abnormality, 4
|
||
cycles per patient), a reported accuracy of 100\% would have a strong
|
||
possibility of generalising poorly. This does however, serve as an early
|
||
example of limited success in general heart sound classification.\\
|
||
|
||
Maglogiannis et al.\ presented a classifier for discrimination of heart valve
|
||
disease from regular heart sounds using an SVM (Support Vector Machine)
|
||
classifier~\parencite{Maglogiannis2009}. Roughly 100 features were extracted
|
||
from the signal, based on direct analysis of each heart cycle component (S1,
|
||
Systole, S2, Diastole) and the average Shannon energy envelope of these
|
||
components. A database of 198 heart sounds was curated for the project,
|
||
acquired from 8 sources, such as medical CDs and pre-existing databases. An
|
||
accuracy of 91.43\% was reported using 10-fold stratified cross-validation. In
|
||
addition, the project aimed to classify individual abnormalities in a 3 step
|
||
process, by distinguishing between systolic or diastolic murmurs, and then
|
||
distinguishing between aortic or mitral diseases. The classifier achieved
|
||
accuracy between 90-97\% for these classifications. This approach demonstrates
|
||
the potential for a system to accurately distinguish between normal and
|
||
abnormal heart sounds in a generalisable way, given carefully selected
|
||
features.\\
|
||
|
||
Ari et.\ al also proposed an SVM based method for abnormality
|
||
classification~\parencite{Ari2010}. A modified Least-squares SVM (LSSVM) is
|
||
used in order to improve separability between normal and abnormal datapoints
|
||
during training. 32 wavelet based features from previous literature are used as
|
||
feature vectors for a modified LSSVM, un-modified LSSVM and a standard SVM.
|
||
Comparison of the system shows that the proposed technique performs
|
||
significantly better on all test sets with an accuracy of between 86\% and
|
||
100\%, dependent on database. This highlights the importance of choosing an
|
||
appropriate classification method for achieving accurate results.\\
|
||
|
||
Quiceno-Manrique et al.\ demonstrate the use of various time frequency
|
||
representations (TFR) such as short-time Fourier transform, wavelet transforms,
|
||
Wigner-Ville distribution etc\ldots, with a $k$-nearest neighbour classifier
|
||
(k-NN) for systolic murmur detection~\parencite{Quiceno-Manrique2010a}. This
|
||
work highlights the effectiveness of alternative TFRs to traditional Fourier
|
||
methods. This method also employs Principle Component Analysis (PCA), ie.\ the
|
||
mapping of a high dimensional feature space to a lower dimension, in order to
|
||
improve computational performance. Features were evaluated using a database
|
||
of of 22 patients, 6 of which were labeled as having a systolic murmur. The
|
||
highest reported accuracy was achieved using MFCCs as the primary feature
|
||
vector achieving a 98\% accuracy on 10-fold cross validation.\\
|
||
|
||
Schmidt et al.\ aimed to find features that could be used for classification of
|
||
coronary artery disease through detection of small
|
||
murmurs~\parencite{Schmidt2015}. A large number of features are
|
||
calculated to provide vectors for classification; Parametric spectral features
|
||
such as ARMA are used, alongside instantaneous frequency and octave power
|
||
measurements. Complexity features, such as sample entropy and simplicity, are
|
||
also calculated in an attempt to exploit the likely stochastic nature of
|
||
murmurs when compared to normal heart sounds. Given the large number of
|
||
features calculated, PCA is used to retain only the most relevant information.
|
||
Quadratic discriminant analysis (QDA) is then used as a classifier to provide a
|
||
final accuracy score of 73\%.\\
|
||
|
||
An overview of significant research prior to the Physionet challenge is
|
||
provided in table~\ref{SumPrior}. It is also noted that none of the databases
|
||
used for prior research are publicly available.
|
||
|
||
|
||
\newgeometry{margin=1cm} % modify this if you need even more space
|
||
\begin{landscape}
|
||
\begin{table}[htbp]
|
||
\captionof{table}{Summary of research prior to the Physionet Challenge 2016}\label{PriorWorkTable}
|
||
\scriptsize
|
||
%\centering
|
||
\rowcolors{1}{gray!15}{white}
|
||
\label{SumPrior}
|
||
\doublespacing
|
||
\begin{tabulary}{\linewidth}{LLLLLL}
|
||
\dtoprule
|
||
Author & Pre-processing/segmentation & Features & Classification Method & Database & Reported Accuracy \\ \hline
|
||
Maglogiannis et.~al \citeyearpar{Maglogiannis2009} & Wavelet decomposition, Shannon energy peak picking & Features derived from wavelet decomposition and PCG segmentations & SVM & 198 recordings, 38 normal, 41 AS systolic murmur, 43 MR systolic murmur, 38 AR diastolic murmur, 38 MS diastolic murmur & $91.43\%\;Ac$ \\
|
||
Ari et~al. \citeyearpar{Ari2010} & Amplitude envelope peak picking~\parencite{Ari2007} & Wavelet based features & LSSVM & 64 patients, 64 recordings, 512 cycles & $88.750-100\%\;Ac$ (dependant on abnormality type) \\
|
||
Quiceno-Manrique et~al. \citeyearpar{Quiceno-Manrique2010a}& Downsampled to 4KHz, Normalised to maximum of signal, ECG assisted QRS complex detection algorithm used for segmentation & Spectral features derived from STFT, Wavelet decomposition and quadratic energy distributions & $k$-NN & 22 patients, 16 normal, 6 abnormal, 8 recordings (12s) per patient & $98\%\;Ac$ \\
|
||
Schmidt et~al. \citeyearpar{Schmidt2015} & Signal filtered into frequency bands, Segmented by HMM based method+hand corrected, removal of high variance sub-segments to remove noise & Parametric spectral features (AR, ARMA and Music), Instantaneous frequency and amplitude, Power in octave bands & QDA & 435 Recordings, 133 patients, 70 normal, 63 abnormal & $73\%\;Ac$ \\
|
||
Reed et~al. \citeyearpar{Reed2004} & --- & Wavelet decomposition coefficients, Manual feature reduction & ANN & 5 patients, 4 cycles per patient & $100\%\;Ac$ \\
|
||
\dbottomrule\\
|
||
\end{tabulary}
|
||
$Ac = \text{Accuracy}$
|
||
\end{table}
|
||
\end{landscape}
|
||
\restoregeometry
|
||
|
||
\subsubsection{Physionet challenge entries}\label{ChallengeEnt}
|
||
\doublespacing
|
||
The 2016 Physionet/CinC Challenge aimed to encourage development of heart
|
||
abnormality detection algorithms by providing a large open database of PCG
|
||
signal recordings, sourced from a variety of both clinical and non-clinical
|
||
environments. (Further details on the database can be found in
|
||
section~\ref{Database}. The complete specification is presented by Liu et al.~\parencite{Liu2016}). In addition, participants were provided with a
|
||
state-of-the-art heart sound segmentation algorithm, as proposed by Springer
|
||
et al.\ in Section~\ref{Segmentation}. Participants were then tasked with the
|
||
creation of a classification algorithm that could robustly discriminate between
|
||
healthy and unhealthy heart sound samples. The challenge received 348 entries
|
||
in total, each of which was scored on a hidden test database
|
||
using a Modified accuracy measure ($Acc$) as defined by Clifford et.
|
||
al~\parencite{Clifford2016}:
|
||
\begin{table}[htbp]
|
||
\centering
|
||
\caption{Output Classification}
|
||
\label{OutputClassification}
|
||
\doublespacing
|
||
\begin{tabular}{llccc}
|
||
\hline
|
||
& & \multicolumn{3}{c}{Algorithm's Output} \\ \hline
|
||
& & \multicolumn{1}{l}{Normal} & \multicolumn{1}{l}{Uncertain} & \multicolumn{1}{l}{Abnormal} \\
|
||
\multirow{4}{*}{Ground Truth} & Normal, clean & $Nn_1$ & $Nq_1$ & $Na_1$ \\
|
||
& Normal, noisy & $Nn_2$ & $Nq_2$ & $Na_2$ \\
|
||
& Abnormal, clean & $An_1$ & $Aq_1$ & $Aa_1$ \\
|
||
& Abnormal, noisy & $An_2$ & $Aq_2$ & $Aa_2$ \\ \hline
|
||
\end{tabular}
|
||
\end{table}
|
||
|
||
\doublespacing
|
||
|
||
Weights are calculated as:
|
||
\begin{table}[H]
|
||
\centering
|
||
\doublespacing
|
||
\begin{tabular}{ll}
|
||
$Wa_1 = \frac{\text{Clean abnormal recordings}}{\text{Total abnormal recordings}}$ & $Wa_2 = \frac{\text{Noisy abnormal recordings}}{\text{Total abnormal recordings}}$ \\
|
||
$Wn_1 = \frac{\text{Clean normal recordings}}{\text{Total normal recordings}}$ & $Wn_2 = \frac{\text{Noisy normal recordings}}{\text{Total normal recordings}}$
|
||
\end{tabular}
|
||
\end{table}
|
||
|
||
Modified sensitivity ($Se$), specificity ($Sp$) and overall accuracy ($Acc$) are then calculated as:
|
||
|
||
\begin{align*}
|
||
&Se=Wa_1\frac{Aa_1}{Aa_1+Aq_1+An_1}+Wa_2\frac{Aa_2+Aq_2}{Aa_2+Aq_2+An_2} \\
|
||
&Sp=Wn_1\frac{Nn_1}{Na_1+Nq_1+Nn_1}+Wn_2\frac{Nn_2+Nq_2}{Na_2+Nq_2+Nn_2} \\
|
||
&Acc=\frac{Se+Sp}{2}
|
||
\end{align*}
|
||
|
||
This section summarises some of the key works presented for the challenge,
|
||
including some of the most accurate models, and a baseline classifier
|
||
provided to participants as a starting point.\\
|
||
|
||
This baseline classifier was provided in order to demonstrate the basic
|
||
structure of systems expected for entries~\parencite{Liu2016}. The classifier
|
||
extracted a selection of 20 basic features, primarily focused on relative
|
||
timings and amplitudes of heart sounds. A binary logistic regression model was
|
||
chosen for classification. From the 20 extracted features, 13 were selected
|
||
based on their statistical significance, measured using forward likelihood ratio
|
||
selection. The system achieved a reported score of 66\% on the test set, giving
|
||
a baseline score for participants to build on. In addition, the system was
|
||
trained using leave-one-out cross validation. By removing a single training
|
||
database on each fold, the generalisation of the algorithm could then be
|
||
evaluated, training on all other databases. Results showed that performance
|
||
decreased significantly when training via this method, giving an average
|
||
accuracy of 59\%, with training database $b$ scoring as low as 47\%. This
|
||
could suggest that individual databases in the database are not sufficiently
|
||
represented by other databases, or that features do not model abnormalities
|
||
sufficiently.\\
|
||
|
||
Homsi et.\ al proposed a system that utilised 131 time domain, STFT-based and
|
||
wavelet-based features which, when combined with nested ensemble classifiers,
|
||
produced an accuracy score of 84.48\%~\parencite{Homsi2017}. This algorithm
|
||
combines many commonly-used features in previous PCG related literature such as
|
||
wavelet decomposition based features, MFCCs and Shannon Energy. The system also
|
||
uses a total of 40 classifiers, 20 for signals labelled to be `standard' and 20
|
||
for those labelled as `atypical'. A mixture of Random Forrest, LogitBoost and
|
||
Cost-Sensitive Classifiers (CSC) are used to classify signals in parallel.
|
||
Final results are combined using a rule-based decision, designed through manual
|
||
experimentation.\\
|
||
|
||
Potes et al.\ presented a similar approach to that of Homsi et
|
||
al.~\parencite{Potes2016}. 124 similar TFR features were extracted and used as
|
||
vectors for an AdaBoost classifier. This classifier was then combined with a
|
||
deep learning approach using a Convolutional Neural Network (CNN) classifier.
|
||
The signal was decomposed into 4 frequency bands and segmented, to provide
|
||
input to the CNN. Results from both AdaBoost and CNN classifiers were then
|
||
combined using a set decision rule. This method produced the highest score on
|
||
the test set for the challenge at 86.02\%.\\
|
||
|
||
Zabihi et al.\ took an alternative approach by choosing not to segment PCG data
|
||
in the pre-processing stage~\parencite{Zabihi2016}. This was with the intention
|
||
of reducing computational complexity of the resulting algorithm. In addition,
|
||
the proposed method utilizes a sequential forward feature selection (SFS) and
|
||
Linear Predictive Coefficients (LPC) for the reduction of features used for
|
||
classification. This benefits the system by removing correlated and irrelevant
|
||
features, thus reducing computational complexity and removing irrelevant noise
|
||
from feature vectors prior to training. Final classifications are determined
|
||
through cascaded ensembles of ANNs. The signal is first classified as either of
|
||
high or low sound quality, and then as normal or abnormal. The system achieved
|
||
a final score of 85.9\% on the hidden test set.\\
|
||
|
||
Plesinger et al.\ opted to develop a new form of machine learning algorithm
|
||
based on probability assessment~\parencite{Plesinger2017}. In this method,
|
||
features are mapped to histograms and thought of as probability distributions.
|
||
Weights are applied based on the number of occurrences of each feature, which in
|
||
turn generates a probability function. This can then be used to calculate the
|
||
estimated classification of a new data point. From the 228 extracted features,
|
||
53 features were then selected based on calculated sensitivity and specificity
|
||
scores using generated histograms. This allowed for the training scores to be
|
||
automatically optimized by the algorithm.\\
|
||
|
||
Kay et al.\ present a method using ANNs, a wide variety of features and PCA for
|
||
feature reduction~\parencite{Kay2017}. The algorithm scores well on the test
|
||
set. However, this work is most notable for it's rigorous evaluation by
|
||
authors, using leave-one-out cross validation for a clearer understanding of
|
||
the generalisation of the algorithm, as well as highlighting issues with the
|
||
underlying database that are discussed in Section~\ref{Database}
|
||
|
||
|
||
\newgeometry{margin=1cm} % modify this if you need even more space
|
||
\begin{landscape}
|
||
\begin{table}[H]
|
||
\captionof{table}{Summary of top 10 Physionet Challenge 2016 entries}
|
||
\label{PhysionetTable}
|
||
\scriptsize
|
||
%\centering
|
||
\rowcolors{1}{gray!15}{white}
|
||
\doublespacing
|
||
\begin{tabulary}{\linewidth}{CCCCC}
|
||
\dtoprule
|
||
Author & Features & Classification Method & Reported Scores & Challenge Score \\ \midrule
|
||
Potes et.~al \citeyearpar{Potes2016} & 124 TFR features & Combined AdaBoost/ANN & In-house test set accuracy: AdaBoost-abstain: 79\%, CNN: 82\%, Combined classifiers: 85\% & 86.02\% \\
|
||
Zabihi et.~al \citeyearpar{Zabihi2016} & 40 temporal, spectral and TFR features, reduced using SFS and LPC & 2 ensembles of neural networks & Training accuracy: Maximum of 91.50\% & 85.90\% \\
|
||
Kay et.~al \citeyearpar{Kay2017} & CWT, MFCCs, complexity measures, Inter-beat features, PCA & ANNs & A range of cross validation based tests were used to analyse performance. See paper for full details & 85.20\% \\
|
||
Bobillo \citeyearpar{Bobillo2016} & MFCCs and WPD, reduced using tensor decomposition & $k$-NN & A range of cross validation based tests were used to analyse performance. See paper for full details & 84.54\% \\
|
||
Homsi et.~al \citeyearpar{Homsi2017} & 131 time domain, STFT based andwavelet based features & Combined ensembles of LogitBoost, Random Forrest and CSC & Training accuracy 87.7\%, In-house test accuracy: 93.24\% & 84.48\% \\
|
||
Maknickas et.~al \citeyearpar{Maknikas2017} & MFCCs, reduced by Karhunen–Loeve transform & Deep Neural Network & Training accuracy 99.7\%, Validation accuracy 95.2\% & 84.15\% \\
|
||
Plesinger et.~al \citeyearpar{Plesinger2017} & Statistical and symmetry properties of amplitude envelopes for S1 and S2 sounds & Custom probability assessment machine learning algorithm & Training accuracy 90.3\% & 84.11\% \\
|
||
Rubin et.~al \citeyearpar{Rubin2016} & MFCCs & Convolutional neural networks & -- & 83.99\% \\
|
||
Jiayu (paper not submitted) & -- & -- & -- & 82.82\% \\
|
||
Abdollahpur et.~al \citeyearpar{Abdolahpur2017} & time, TFR and perceptual features, reduced using Fisher's discriminant analysis & Combined ANNs & Training accuracy: 91.6\%, 87\%, 84.55\% (prior to ANN combination method) & 82.63\%\\
|
||
\dbottomrule\\
|
||
|
||
\end{tabulary}
|
||
\end{table}
|
||
\end{landscape}
|
||
\restoregeometry
|
||
\doublespacing
|
||
\section{Database}\label{Database}
|
||
%TODO: Briefly describe what is needed from a database for this project
|
||
A database representative of real-world PCG signals was needed to train models
|
||
and evaluate the proposed method effectively. A number of criteria were
|
||
identified as necessary for the success of the proposed project:
|
||
\begin{itemize}
|
||
\item The database must contain sufficient PCG data, so that a model
|
||
trained to discriminate between said signals would, in theory, generalise
|
||
to new PCG data.
|
||
\item The database should contain a mixture of clean and noisy signals that
|
||
represent a variety of real world situations, as real-world
|
||
classification would likely be performed in sub-optimal conditions. If
|
||
this is not possible, noise could potentially be added to clean signals
|
||
to simulate this.
|
||
\item It must be possible to differentiate healthy signals from a variety of
|
||
individual pathologies, in order to provide a general abnormality
|
||
detection algorithm. This should be reflected in the database through
|
||
inclusion of a variety of signals representing different pathological
|
||
heart conditions.
|
||
\item Data must be reliably labelled in order to generate a reliable model
|
||
(particularly when using machine learning methods, as in the proposed
|
||
project). Labels should ideally be verified by a trained professional.
|
||
\end{itemize}
|
||
\noindent
|
||
Two viable options were then considered based on the above criteria:
|
||
\begin{enumerate}
|
||
\item The Physionet challenge database
|
||
\item Generation of a synthetic dataset via methods such as that proposed
|
||
by Almasi et al.~\parencite{Almasi2011}
|
||
\end{enumerate}
|
||
|
||
Generation of synthetic data was considered, as few well-formed alternative
|
||
databases exist, other than the Physionet challenge data. The database curated
|
||
for the Physionet challenge was selected for this project, as it fulfilled the
|
||
criteria sufficiently and posed less of a risk in terms of signal quality, due
|
||
to all signals being produced in real-world environments. However, synthesis
|
||
of PCG data remains an interesting possibility for improving evaluation of
|
||
classification systems and could be considered for the generation of additional
|
||
samples in future work.
|
||
|
||
\subsection{Database summary}
|
||
The selected database is significantly larger and contains a wider variety of
|
||
signal conditions than any database used for previous research (as detailed in
|
||
table~\ref{PriorWorkTable}). It is released as an open-source resource and is
|
||
documented in significant detail by Liu et al.~\parencite{Liu2016}. The lack of
|
||
any alternative databases, comparable in size or variety of content, perhaps
|
||
makes this resource the current standard for PCG analysis projects. In
|
||
addition, by replicating the conditions of the Physionet challenge, results can
|
||
be directly compared with those of the challenge participant's, with the aim of
|
||
understanding how the proposed algorithm compares to the current state of PCG
|
||
analysis.
|
||
|
||
\begin{itemize}
|
||
\item The database consists of 6 sub-databases, labelled $a$ to $f$.
|
||
\item These sub-databases have been sourced from a variety of professionals,
|
||
over the course of a decade.
|
||
\item A total of 3,240 recordings are included, created using varying equipment.
|
||
\item 2575 recordings are labelled as normal, 665 are labelled as abnormal.
|
||
\item All samples have been resampled to 2KHz
|
||
\item Samples were recorded in a range of both clinical and
|
||
non-clinical environments.
|
||
\item Many recordings are corrupted with environmental noise, such as
|
||
microphone friction, breathing, talking etc\ldots
|
||
\item Sections of silence are present in some recordings, most
|
||
significantly in database $e$.
|
||
\end{itemize}
|
||
|
||
\subsection{Considerations}\label{DBCons}
|
||
There are a number of issues with the acquired database that have been
|
||
highlighted, both through previous literature and through development of the
|
||
proposed system. These have been considered throughout development and evaluation of
|
||
the project.\\
|
||
A significant issue highlighted by Liu et al.\ is the large number of normal
|
||
recordings compared to pathological recordings. This creates a clear class
|
||
imbalance issue that can result in over-inflated classification
|
||
results. This is considered in
|
||
Section~\ref{Resample}.\\
|
||
Another key issue is the difference between the databases used by participants of the
|
||
Physionet challenge, and the available data that was acquired for this project.
|
||
For unknown reasons, information such as patient labels used for training many
|
||
of the challenge participant's models have not been made publicly available and
|
||
so could not be used for training of the proposed system.\\
|
||
The lack of access to the hidden test set used for evaluating challenge entries
|
||
also had a significant impact on evaluation. An alternative method for
|
||
evaluating using only the data provided has been proposed in
|
||
Section~\ref{metrics}.\\
|
||
Finally, an issue is highlighted by Bobillo with regards to database
|
||
$e$~\parencite{Bobillo2016}. The recording of normal and pathological signals using
|
||
separate devices is likely to cause issues and is discussed in
|
||
Section~\ref{Eval}.
|
||
|
||
%BEGIN NEW MATERIAL
|
||
\pagebreak
|
||
\section{Design}
|
||
This project aims to provide robust heart abnormality detection for PCG
|
||
signals, such that use of the system could reliably recommend further medical
|
||
attention when necessary. It is clear from previous research that machine
|
||
learning methods for classification have shown the most promise in this area,
|
||
and that ensemble methods have been largely successful in improving
|
||
classification accuracy of base classifiers~\parencite{Homsi2017, Potes2016}.
|
||
However, one such method that has recently shown significant success in other
|
||
fields, but is not present in recent PCG analysis literature, is the stacking
|
||
meta-classifier~\parencite[p.498]{Tobergte2013a}. The presented system was
|
||
therefore designed to explore the potential of this classification method in
|
||
the context of PCG signal classification. This section details the four key
|
||
components developed to form the final system: signal preprocessing
|
||
(Section~\ref{preprocessing}), audio feature extraction (Section~\ref{featEx}),
|
||
classification (Section~\ref{class}) and optimisation (Section~\ref{optimise}).
|
||
|
||
% TODO: Create flow diagram Preprocessing -> Feature extraction ->
|
||
% Model optimisation -> Performance evaluation
|
||
|
||
\subsection{Preprocessing}\label{preprocessing}
|
||
It quickly became apparent that, due to significant variations in the available
|
||
data (as a result of noise, variations in recording equipment etc...), the
|
||
effective preprocessing of such data would be a critical factor when designing
|
||
the system. This section details the most significant preprocessing steps
|
||
taken, in order to both minimize noise and extract the basic structure of the
|
||
signal.
|
||
|
||
\subsubsection{Signal decimation}
|
||
A common method employed to simultaneously reduce computation time and remove
|
||
extraneous information is to decimate the input signal by an integer factor.
|
||
According to Shannon sampling theorem, a digital signal can only represent
|
||
frequency content up to half the sample rate of the signal (the nyquist
|
||
rate)~\parencite[p.140]{Kadis1999}
|
||
Therefore, by removing every $n$th sample, high frequency content can be
|
||
removed whilst lowering the number of samples that must be processed in
|
||
subsequent operations. An anti-aliasing filter must also be applied to the
|
||
signal in order to filter harmonic distortion generated by the process.
|
||
As it is commonly stated in the literature that little relevant information in
|
||
PCG signal is found above 400Hz, all signals were resampled to 1KHz giving a
|
||
500Hz cutoff frequency, using an 8th order zero-phase Chebyshev type I filter.
|
||
|
||
\subsubsection{Dataset resampling}\label{Resample}
|
||
A common issue with data collected from the real world (ie. non-synthetic data)
|
||
is the imbalance of classes in data. As noted by Liu et
|
||
al.~\parencite{Liu2016}, this is the case with the available dataset, as there
|
||
are less pathological signals than healthy signals. This presents an issue
|
||
with classification tasks, as imbalance can have a negative impact on
|
||
classification of the minor class. In this context, class imbalance could
|
||
potentially impact classification accuracy for abnormal samples, so must be
|
||
handled appropriately. This issue can be approached using a number of methods.
|
||
Sophisticated oversampling methods such as SMOTE (Synthetic Minority
|
||
Oversampling Technique) offer one solution. SMOTE generates synthetic samples
|
||
using interpolation and adds these to the data set to balance the classes,
|
||
without using direct copies of existing data. However, oversampling techniques
|
||
such as this can increase overfitting of models, and don't always offer
|
||
reasonable improvement in performance~\parencite{Longadge2013}. Undersampling
|
||
is the most common method used, typically by randomly removing samples from the
|
||
major class. This has the obvious disadvantage of reducing data available for
|
||
training. However, an improved method using $k$-Means clustering has been shown
|
||
to be effective in previous cardiovascular classifications
|
||
problems~\parencite{Rahman2013}. This method was seen to be the best choice for
|
||
the proposed system. This method is illustrated using a small generated
|
||
2-dimesional dataset in Figure~\ref{cent}.
|
||
|
||
\begin{figure}[H]
|
||
\caption[caption of centroid]{Example resampling of synthesised dataset using cluster centroids\footnotemark}
|
||
\makebox[\textwidth]{\includegraphics[width=\textwidth]{centroid}}
|
||
\label{cent}
|
||
\end{figure}
|
||
\footnotetext{This figure was adapted from: \url{http://contrib.scikit-learn.org/imbalanced-learn/stable/}}
|
||
|
||
\subsubsection{Signal segmentation}
|
||
With one notable exception~\parencite{Langley2016}, previous classification
|
||
algorithms rely heavily on the ability to segment signals into the four
|
||
fundamental heart sounds. This is a key prerequisite to the extraction of
|
||
relevant features. The defining of signal structure allows for the
|
||
relationships between it's components to be analysed, as described in
|
||
Section~\ref{featEx}. To facilitate the development of robust algorithms for
|
||
the Physionet challenge, participants were provided with an implementation of
|
||
Springer's HSMM based segmentation algorithm. As the highest scoring algorithm
|
||
in the literature, it was clearly the most suitable algorithm to use for the
|
||
proposed system. In addition to the high accuracy of segmentation, the wide
|
||
adoption of this algorithm is beneficial for comparison with other algorithms
|
||
submitted to the challenge. Results produced by the proposed system will
|
||
generally not be coloured by the differences in quality of segmentation
|
||
algorithms, allowing for more direct comparison of classification methods.
|
||
However, it is noted that despite the high performance of the algorithm, errors
|
||
in segmentation will still occur, that may have a negative impact on feature
|
||
quality. As methods proposed by previous literature, such as hand correction by
|
||
a professional~\parencite[p.2203]{Liu2016} are not feasible in this context,
|
||
and considering the low number of erroneous results produced by the
|
||
algorithm~\parencite[p.2]{Goda2016} it was decided that these errors would not
|
||
pose a significant problem. An illustration of PCG data segmentation can be
|
||
seen in Figure~\ref{segs}.
|
||
|
||
\begin{figure}[H]
|
||
\caption{Example segmentation of PCG data}
|
||
\makebox[\textwidth]{\includegraphics[width=\textwidth]{segs}}
|
||
\label{segs}
|
||
\end{figure}
|
||
|
||
|
||
|
||
\subsection{Feature extraction}\label{featEx}
|
||
The extraction of feature vectors from data is a fundamental component of most
|
||
machine learning based systems. The aim is to construct meaningful
|
||
representations of the data that emphasize information relevant to the
|
||
classification problem. In the proposed project, 188 features were extracted
|
||
from the data, procuring feature extraction techniques from a wide range of
|
||
previous literature, as well as using novel perceptual features commonly found
|
||
in audio/music analysis (See Sections~\ref{FFT} and~\ref{Time}).
|
||
There are also potential issues that can occur when using large sets of
|
||
features for training. The method proposed for addressing these issues is
|
||
discussed in section~\ref{SFS}. This section provides a summary of the main
|
||
feature categories. Please refer to appendix~\ref{appendixA} for a full
|
||
breakdown of all features.
|
||
|
||
\subsubsection{Time-domain features}\label{Time}
|
||
A range of features were generated, based directly on the time series data.
|
||
Features such as:
|
||
\begin{itemize}
|
||
\item Average and standard-deviation of segment intervals, for all heart
|
||
sounds and complete heart cycles.
|
||
\item Ratio of systolic and diastolic period to total heart cycle period.
|
||
\item A range of statistical features such as entropy, skewness and variance for
|
||
each heart sound.
|
||
\item A selection of envelope based features for each heart sound.
|
||
\end{itemize}
|
||
|
||
18 features provided by the Physionet challenge focused on timings between
|
||
segments of the heart cycles. It was thought that these features would be
|
||
useful in capturing irregularities caused by conditions such as arrhythmias,
|
||
atrial septal defect and other conditions that are likely to affect relative
|
||
timing of heart sounds.
|
||
Many conditions that can be detected by traditional auscultation are
|
||
characterised by an increase in loudness of the S1 and/or S2 heart
|
||
sounds~\parencite{Brown2008}. This suggests that features relating to human
|
||
perception of loudness may aid in the detection of such conditions. Simple
|
||
envelope based features such as RMS, peak loudness and the Shannon energy
|
||
envelope (Equation~\ref{ShanEQ}) that proved popular in previous literature,
|
||
were extracted for this reason~\parencite[p.73-77]{Lerch2012}. In addition,
|
||
statistical features such as sample entropy and skewness (Equation
|
||
~\ref{SkewEQ}) were used to evaluate the distribution of samples for each heart
|
||
sound, these were selected to provide a representation of the temporal
|
||
``shape'' of each sound.
|
||
|
||
\begin{equation}\label{ShanEQ}
|
||
SE = \frac{-1}{N}\sum\limits_{n=0}^N x(n)^2\cdot \log{x(n)^2}
|
||
\end{equation}
|
||
\begin{equation}\label{SkewEQ}
|
||
S=\frac{E(x-\mu)^3}{\sigma^3}
|
||
\end{equation}
|
||
Where:\\
|
||
$x(n)$ is the input signal\\
|
||
$E(t)$ is the expected value\\
|
||
$\mu$ is the mean of the signal\\
|
||
$\sigma^2$ is the variance of the signal
|
||
|
||
\subsubsection{FFT-based features}\label{FFT}
|
||
It was recognised that a time domain representation alone was unlikely to
|
||
provide a sufficient representation for discerning a wide variety of
|
||
conditions. Using a time-frequency representation to characterise the spectral
|
||
components of the signal has proven effective in the majority of literature.
|
||
The classic method for producing a spectral representation of a signal is to
|
||
apply the Discrete Fourier Transform (DFT) (as defined in Equation~\ref{FFTEQ})
|
||
over a sliding window of size $N$. By decomposing the signal into a series of
|
||
sine and cosine waves, a representation of the signal's spectral content across
|
||
a range of frequency bands is produced. This can be used for further analysis
|
||
of heart sounds based on their spectral characteristics.
|
||
\begin{equation}\label{FFTEQ}
|
||
X(k)=\sum\limits_{n=0}^{N}x(n)e^{\frac{-j2\pi kn}{N}}
|
||
\end{equation}
|
||
Where $x(n)$ is the input signal\\
|
||
Features generated using this representation would, in theory, be useful for
|
||
identifying conditions that reside in specific frequency bands, such as
|
||
murmurs, for example~\parencite{Sepehri2010}.\\
|
||
|
||
An example of such features are Mel-Frequency Cepstrum Coefficients (MFCCs).
|
||
Popular in speech processing, MFCCs provide a compact representation of a
|
||
signal's spectral shape. MFCCs are calculated by first applying $N$ (a
|
||
user-defined parameter) triangular filter banks, spaced using the mel scale to
|
||
the magnitude spectrum. Applying a discrete cosine transform to the log of the
|
||
filterbank outputs provides the final set of coefficients (for further details,
|
||
please refer to~\parencite{Lerch2012}). This analysis
|
||
creates a perceptually relevant representation of spectral shape, in effect
|
||
mimicking the way in which humans might perceive the spectral shape of heart
|
||
sounds. The reasoning for this is that, as the aim is to provide a system with
|
||
performance better than, or equal to to that of a human, features that mimic
|
||
what a human perceives may prove effective at distinguishing conditions in the
|
||
way that a human does. This has shown to be effective in previous literature,
|
||
with multiple systems utilising perceptual features with
|
||
success~\parencite{Ortiz2016, Rubin2016, Quiceno-Manrique2010a}. 13 MFCCs were
|
||
calculated for each heart sound and averaged to provide 13 features
|
||
per sample.\\
|
||
%TODO: Generate MFCC spectum
|
||
|
||
In addition to MFCCs, other statistical features were extracted from the
|
||
spectrum such as spread, skewness, kurtosis and flatness. These features aim to
|
||
provide alternate spectral measurements to MFCCs, in a similar way to their
|
||
temporal counterparts as described in Section~\ref{Time}.
|
||
|
||
Although the Fourier representation of PCG signals has proven effective in many
|
||
cases, there are drawbacks of this representation that must be considered. One
|
||
key issue that is inherent of Fourier transforms is the time-frequency
|
||
trade-off. An increase in frequency resolution will always result in a decrease
|
||
in temporal resolution. This poses a problem, as it is not possible to localize
|
||
transient events accurately in the frequency domain using this method. This
|
||
method may also suffer in the presence of background noise common in PCG
|
||
signals. Previous studies have shown that these factors may have a significant
|
||
impact when detecting conditions such as Coronary
|
||
Stenoses~\parencite{Ergen2001, Akay1990}
|
||
|
||
\subsubsection{Wavelet decomposition features}
|
||
The wavelet transform has been used effectively as an alternative
|
||
time-frequency representation to Fourier methods. The fundamental concept of
|
||
the wavelet transform is to represent an input signal as a set of scaled and
|
||
shifted finite oscillations. By comparing the signal with each scale of wavelet
|
||
at all points in time, a set of $N\times A$ (Where $A$ is the number of scales)
|
||
coefficients are generated. These define the scale and position needed for
|
||
each wavelet in order to fully reconstruct the signal (This is illustrated in Figure~\ref{wave}. For further details,
|
||
refer to~\parencite{Polikar1994}). The benefit of this transform is that it is
|
||
well localized in both time and frequency domains. This allows for accurate
|
||
representation of transient events such as clicks and snaps that are
|
||
characteristic of heart conditions such as Mitral valve prolapse or
|
||
stenosis~\parencite{Brown2008}.\\
|
||
For the proposed system, a 5 level DWT using daubechies wavelets-4 mother wavelet was
|
||
used for decomposition and reconstruction. Statistical features such as entropy
|
||
were then calculated, both on the reconstructed signal and directly on
|
||
coefficients to attain a total of 48 features.~\parencite{Homsi2016}
|
||
% TODO: Insert wavelet diagram here
|
||
|
||
\begin{figure}[H]
|
||
\caption{Example 5 level daubechies 4 wavelet decomposition and reconstruction (normalised). Plots in descending order: D1, D2, \ldots, D5, A1}
|
||
\makebox[\textwidth]{\includegraphics[width=1.0\textwidth]{wavelet}}
|
||
\label{wave}
|
||
\end{figure}
|
||
|
||
\subsubsection{Feature scaling and imputing}
|
||
A common problem when working with multiple features is the difference in scale
|
||
between features. This problem can cause many machine learning algorithms to place
|
||
bias on larger scale features and can significantly impact the time taken for
|
||
certain algorithms to converge. This is particularly significant when applying
|
||
algorithms sensitive to feature scale such as SVMs (described in
|
||
Section~\ref{SVM}). To address this, a Min-Max scaler was applied
|
||
to training and test sets prior to training models. This scales all values to within a
|
||
0--1 range producing a set of features on a common scale.\\
|
||
It is also common to encounter missing values in features. These can occur as a
|
||
result of $\log(0)$ or division by 0 calculations, amongst other edge cases. A
|
||
standard method for handling these values is to apply an imputer, replacing
|
||
values with the mean of the feature vector~\parencite{VanderPlas2017}.
|
||
|
||
\subsection{Stacking classifier with cross-validation}\label{class}
|
||
The stacking classifier is an ensemble classifier, that uses the results of
|
||
multiple base classifiers as input to a 2nd level meta-classifier, which in
|
||
turn is used to generate a final prediction. $k$-fold cross validation is used
|
||
across base classifiers, training on $k-1$ folds of input data, and applying
|
||
to the remaining validation set. The results of these predictions from each
|
||
base classifier are combined and used to train the 2nd level classifier which
|
||
produces the final predictions based on the probabilities and predictions
|
||
provided. This is illustrated in figure~\ref{stack}\\
|
||
Given it's proven accurate performance across a range of tasks, it was
|
||
expected that this classification model could be applied effectively to produce
|
||
an alternative method for abnormality detection than those presented in
|
||
previous literature.
|
||
% TODO:Insert stacking classifier diagram
|
||
|
||
\begin{figure}[H]
|
||
\caption[caption of stack]{Stacking classifier overview\footnotemark}
|
||
\makebox[\textwidth]{\includegraphics[width=0.5\textwidth]{stacking_cv_classification_overview}}
|
||
\label{stack}
|
||
\end{figure}
|
||
\footnotetext{Figure retrieved from:\url{http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/}}
|
||
|
||
\subsubsection{Base classifiers}
|
||
Clearly, an important consideration when using any ensemble method is the
|
||
selection of the base classifiers. In order for any ensemble method to perform
|
||
well, it must be constructed using a selection of classifiers that individually
|
||
provide useful models for the data~\parencite[p.484]{Tobergte2013a}. A wide
|
||
variety of models were considered for use as base and meta models including
|
||
models such as Tree based, $k$-Nearest Neighbour, and AdaBoost classifiers.
|
||
Selection of these models was based on a novel approach using hyperparameter
|
||
optimisation as discussed in Section~\ref{optimise}. The following sections
|
||
detail the 3 final models selected by the optimisation algorithm; A combination
|
||
of SVM and Naive-Bayes classifiers, with a Logistic Regression meta classifier.
|
||
|
||
\paragraph{SVM}\label{SVM}
|
||
The SVM classifier aims to fit a hyperplane to data that maximises the
|
||
separability between classes. This results in a model that has been shown to
|
||
generalise well in many cases, as maximising separability between classes is
|
||
also likely to increase the margin for error in separation of classes. This
|
||
type of classifier is also able to generate hyperplanes in non-linear space,
|
||
using a techniques known as `kernel tricks'. This works by mapping linear data
|
||
to a higher dimension, allowing non-linearly separable classes to be separated
|
||
by the same method. The details of the SVM and Kernel-SVM are complex and
|
||
outside the scope of this report. Further information can be found
|
||
in~\parencite[p.187]{Tobergte2013a}.\\
|
||
SVMs have been prevalent in previous literature, shown to be effective in
|
||
separation of a variety of heart conditions~\parencite{Ari2010} The use of
|
||
kernels to map parameters to higher dimensions is a key advantage of this
|
||
model, allowing for non-linear relationships that are likely to be present in
|
||
the large variety of features to be well represented in classification. Choice
|
||
of kernels, and relevant hyperparameters is detailed in Section~\ref{PSOp}.
|
||
|
||
\paragraph{Naive-Bayes}
|
||
Commonly used in text classification problems, where there is typically a
|
||
high-dimensional feature space, Naive Bayes classification uses Bayes rule to
|
||
determine the probability of classification, given a vector of features. This
|
||
is calculated as:
|
||
\begin{equation}
|
||
P(y\mid x_1,\ldots,x_n)=\frac{P(y)\prod\limits_{i=1}^{N}P(x_i\mid y)}{P(x_1,\ldots,x_n)}
|
||
\end{equation}
|
||
The implementation used assumes a Gaussian distribution for all features,
|
||
calculating the probability of a feature as:
|
||
\begin{equation}
|
||
P(x_i\mid y)=\frac{1}{\sqrt{2\pi
|
||
\sigma_y^2}}\exp\bigg(-\frac{(x_i-\mu_y)^2}{2\sigma^2_y}\bigg)
|
||
\end{equation}
|
||
Where:\\
|
||
$\mu$ is the mean of the distribution\\
|
||
$\sigma^2$ is the variance\\
|
||
Using Maximum Likelihood estimation to estimate $\sigma$ and $\mu$ given the
|
||
feature vector, a classification for new features can then be calculated as:
|
||
\begin{equation}
|
||
\hat{y}=\argmax\limits_y P(y)\prod\limits_{i=1}^nP(x_i\mid y)
|
||
\end{equation}
|
||
Where:\\
|
||
$x$ is the feature vector to be classified\\
|
||
$\hat{y}$ is the estimated classification\\
|
||
|
||
Despite their computational simplicity, Naive Bayes classifiers have been shown
|
||
to produce highly accurate classifications models. The assumption that each feature is
|
||
completely independent allows for extremely fast classification and scalability
|
||
to large datasets, with many dimensions~\parencite[p.300]{Zhang2004}. It was
|
||
thought that these benefits would make the classifier suitable for the proposed system, as the relatively high
|
||
dimensionality of features and quantity of data points could then be classified
|
||
quickly, to obtain initial results. Despite the inclusion of more complex
|
||
models, this model was chosen via automatic selection for the final model.
|
||
Refer to section~\ref{PSOp} for further details.
|
||
|
||
\paragraph{Logistic regression}
|
||
Logistic regression is a regression model that aims to fit as hyperplane to
|
||
data points by minimizing a cost function using weighted features.
|
||
By applying weights to feature vectors then applying a sigmoid function, a
|
||
hypothesis function is defined as:
|
||
\begin{equation}
|
||
h_\theta(x)=\frac{1}{1-e^{-\theta^{T}x}}
|
||
\end{equation}
|
||
Where:\\
|
||
$x$ is a feature vector\\
|
||
$y$ is a class label vector \\
|
||
$\theta$ is a weight vector \\
|
||
A cost function can then be defined as:
|
||
\begin{align}
|
||
&J(\theta)=\argmin\limits_\theta\frac{1}{2m}\sum\limits_{i=1}^m\Big(h_\theta(x^{(i)})-y^{(i)}\Big)^2+\text{Regularization}(\theta)\\
|
||
&\text{Regularization}{(\theta)}_\text{L1}=\lambda\sum\limits_{j=1}^n\mid\theta_i\mid\\
|
||
&\text{Regularization}{(\theta)}_\text{L2}=\lambda\sum\limits_{j=1}^n\theta_i^2
|
||
\end{align}
|
||
Where:\\
|
||
$\lambda$ is the regularization parameter used to help prevent overfitting\\
|
||
By minimizing the cost function, classification predictions can then be made
|
||
using the hypothesis function~\parencite{Ng2012}.\\
|
||
Logistic regression was chosen as the meta-classifier primarily due to it's
|
||
simplicity and performance in testing. It is thought that this algorithm
|
||
performed particularly well as output from base classifiers was linearly
|
||
separable and relatively simple (in comparison to input features). The choice of
|
||
meta-classifier is a potential area for improvement and it is noted that a
|
||
range of meta-classifiers have been proposed for different tasks that utilise
|
||
stacking~\parencite[p.29]{Sesmero2015}. Further work in this area could
|
||
potentially provide improved results.
|
||
|
||
\subsection{Model optimisation}\label{optimise}
|
||
As discussed in previous sections, two of the most important aspects that affect
|
||
the performance of a classification system are it's models, and the input
|
||
features. A combination of relevant features and well tuned models is therefore
|
||
likely to provide an accurate classification system. However, it is not always
|
||
immediately clear which values to choose for parameters, or which features to use as
|
||
inputs. This is especially true when given such a wide selection of models to
|
||
choose from, and such high dimensional feature spaces, as are used in the
|
||
proposed system. To address this issue, two automatic optimisation approaches
|
||
were implemented, with the aim of maximising the accuracy of the proposed
|
||
system.
|
||
|
||
\subsubsection{Sequential feature selection}\label{SFS}
|
||
It was recognised that the extraction of such large numbers of features in the
|
||
proposed system would likely result in a large amount of redundant information.
|
||
There are two commonly used methods for addressing this problem: feature
|
||
reduction and feature selection. Feature reduction involves reducing features
|
||
to a lower dimensionality using techniques such as PCA. Conversely, feature
|
||
selection involves selectively removing features entirely via methods such as
|
||
Sequential Floating Forward Selection (SFFS). Both aim to reduce the amount of
|
||
redundant information in features by removing or reducing features that are not
|
||
expected to benefit the model. As a selection of models were to be used, each
|
||
potentially handling dimensionality differently (SVMs in particular), it was
|
||
decided that feature selection would be most appropriate for this
|
||
application.\\
|
||
|
||
Through experimentation, the chosen method was SFFS. This method is an adaptation
|
||
of tradition sequential forward selection, that also uses sequential backward
|
||
selection to allow for subsequent removal of added features when necessary.
|
||
SFFS is an iterative wrapper method that adds features and re-trains the chosen
|
||
model sequentially, choosing features that increase the accuracy of the model
|
||
output (using 3-fold cross validation to avoid overfitting). Final models used
|
||
as few as 40 features, increasing both accuracy of classifications and
|
||
computation time of models significantly.\\
|
||
It should be understood that this method does not guarantee a globally optimal
|
||
set of features. An exhaustive feature selection algorithm is capable of this
|
||
but this would incur significant computational cost. For further details on
|
||
SFFS please refer to~\parencite[p.3]{Ferri1994}
|
||
|
||
\subsubsection{Particle swarm optimisation}\label{PSOp}
|
||
The particle swarm optimisation algorithm is an iterative meta-heuristic algorithm that
|
||
aims to find the set of parameters that maximises a given function. Given a
|
||
$n$ dimensional parameter space, the algorithm randomly initialises sets of
|
||
`particles' representing random combinations of parameters. As the algorithm
|
||
progresses particles travel through the parameter space, updating their
|
||
position based on their velocity, best historical score and the best historical
|
||
score of the swarm. As the algorithm iterates, particles will converge on local
|
||
optima, producing potential solutions. The best score is chosen after the final
|
||
iteration as the best parameter selection. Annotated pseudocode for this
|
||
algorithm is shown in code block~\ref{PSCode}~\parencite{Clerc2002}
|
||
|
||
\onehalfspacing
|
||
\begin{lstlisting}[escapeinside={(*}{*)}, label={PSCode}, caption={Particle
|
||
Swarm Optimisation Pseudocode}]
|
||
Do
|
||
//For all particles...
|
||
For (*$i$*)=1 to Population Size
|
||
// If the current function score is better than the historical
|
||
// function score for the current particle, store new position
|
||
if (*$f(\overrightarrow{x}_i)>f(\overrightarrow{p}_i)\text{ then }\overrightarrow{p}_i = \overrightarrow{x}_i$*)
|
||
// Store best position of all particles in neighbourhood
|
||
(*$\overrightarrow{p}_g=\text{max}(\overrightarrow{p}_{\text{neighbors}})$*)
|
||
// For each dimension in the parameter space...
|
||
For (*$d=1$*) to Dimension
|
||
// Update velocities
|
||
(*$v_{id}=v_{id}+\phi_1(p_{id}-x_{id}+\phi_2(p_{gd}-x_{id})$*)
|
||
// Ensure particle velocity is within limit
|
||
(*$v_{id}=\text{sign}(v_{id})\cdot \text{min}(\text{abs}(v)_{id}, v_{\text{max}}))$*)
|
||
// Update particle position
|
||
(*$x_{id}=x_{id}+v_{id}$*)
|
||
Next (*$d$*)
|
||
Next (*$i$*)
|
||
Until termination criterion is met
|
||
\end{lstlisting}
|
||
\doublespacing
|
||
|
||
The use of this algorithm allowed for the efficient optimisation of all
|
||
parameters relating to the stacking classifier and it's base classifiers,
|
||
resulting in a finely tuned classification model that would not have been
|
||
producible using traditional trial and error methods to search for optimal
|
||
parameters.\\
|
||
During the initial design phase, it was found that the abundance
|
||
of machine learning algorithms available make selection of the optimal model
|
||
difficult, requiring in depth knowledge of a range of machine learning
|
||
techniques. A novel approach used by recent stacking classifier applications
|
||
has been in the use of meta-heuristic algorithm to select models automatically,
|
||
in addition to tuning parameters. By thinking of the base classifiers as
|
||
hyperparameters themselves, models can be swapped in and tuned automatically by
|
||
the particle swarm algorithm to provide a locally optimal selection of base
|
||
classifiers for the model~\parencite{Sesmero2015}. This technique was used to
|
||
pick the 3 final models described in section~\ref{class} from a selection of 6
|
||
models\footnote{The available models for selection were: Random Forrest,
|
||
$k$-Nearest Neighbour, Logistic Regression, Naive-Bayes, SVM (kernel based and
|
||
linear models) and AdaBoost classifiers, implemented using Scikit-learn}. This
|
||
dynamic selection of models was seen to be one of the key contributors to the
|
||
overall success of the algorithm. As with the chosen feature selection
|
||
algorithm, this method of optimisation is not guaranteed to provide a globally
|
||
optimal solution. It was thought that for the proposed system a locally optimal
|
||
system would suffice, particularly given the highly complex parameter space
|
||
used in implementation. This is discussed in detail in Section~\ref{ModOp}.
|
||
|
||
\subsection{Model performance evaluation method}\label{metrics}
|
||
In order to fully understand the performance of the system (and to evaluate the
|
||
impact of design decisions throughout development), a group of scoring methods
|
||
were implemented to test the system's performance in a selection of scenarios.
|
||
The aim was to provide reliable metrics that would highlight the system's
|
||
strength and weaknesses and to provide quantifiable measures with which to
|
||
compare the system to the range of alternative methods proposed in the
|
||
literature.\\
|
||
|
||
One of the most basic metrics was the scoring of the trained model on a
|
||
separate hold-out dataset. By reserving a selection of samples from across the
|
||
databases, a trained model could then be scored on this dataset for accuracy, sensitivity and
|
||
specificity (metrics described in Section~\ref{ChallengeEnt}) to determine the
|
||
system's performance on an unseen set of samples. This method is widely used to
|
||
provide a basic understanding of a model's ability to generalise to new data, a
|
||
crucial requirement of the system. Data was split using a grouped stratified shuffle
|
||
split, grouping by database. This ensured an equal number of randomly selected
|
||
classes were taken from each database to produce training and test sets. This
|
||
approach was taken to avoid class imbalance issues caused when there is a
|
||
significant difference in class frequency, as detailed in
|
||
Section~\ref{Resample}. It should be noted that although samples were
|
||
stratified by class, it was not possible to stratify samples by patient. This
|
||
may have an impact on results, as the presence of data from the same patient in
|
||
both training and test set may artificially inflate results, where the model has
|
||
learnt patterns specific to that patient that do not generalise to others.\\
|
||
|
||
A more robust method for for model evaluation is cross-validation. By splitting
|
||
the full dataset into multiple folds, and training models on each, metrics can
|
||
be calculated on each fold, and an average can be taken to provide a measure of
|
||
the system's performance over all folds. 10-fold cross validation, stratified
|
||
by class, was chosen for evaluation of the system. This provides an insight
|
||
into the performance of the algorithm across the dataset. This is a common
|
||
method used by all participants of the Physionet challenge and is commonly found
|
||
in prior literature.\\
|
||
It is highlighted by Homsi et.\ al, that a large amount of variance may be
|
||
observed across folds~\parencite[p.1637]{Homsi2017}. This is attributed to the
|
||
variations across databases, making generalisation difficult. To account for
|
||
this, it is suggested that cross-validation is repeated multiple times and
|
||
averaged to provide a more accurate measurement of performance across folds.
|
||
For the proposed system, cross validation was repeated 10 times for each fold
|
||
and averaged to produce the final results. Standard-deviation is also
|
||
calculated across these iterations to illustrate the possible prevalence of
|
||
this.\\
|
||
|
||
A common theme throughout the literature was that of generalisation across
|
||
databases. It was observed that many previous algorithms achieved high
|
||
accuracies in standard cross-validation, but performed significantly worse when
|
||
testing on unseen databases~\parencite{Homsi2017, Bobillo2016}. For this
|
||
reason, leave-one-out cross-validation was used to form a better understanding
|
||
of the system's ability to generalise to unseen data from different sources. On
|
||
each fold, a single database is removed, training on all other databases. This
|
||
is a useful method as it can be used to determine the level to which
|
||
information extracted from databases is representative of other databases.\\
|
||
|
||
The evaluation of models using cross-validation was not limited to final
|
||
evaluation. Evaluation of intermediate models generated by both the SFFS and Particle Swarm
|
||
algorithms was possible, by further separating the test set into test and
|
||
validation fold. This technique was used in the optimisation of the
|
||
final model to provide scores for intermediate models, as well as to help avoid
|
||
overfitting parameters and features to the training set used for
|
||
optimisation.\\
|
||
|
||
Discussion on the performance of the proposed system using these methods can be
|
||
found in Section~\ref{Eval}.
|
||
% TODO: Insert cross validation diagram from data science handbook
|
||
|
||
\section{Implementation}
|
||
This section describes the tools used in the realisation of the
|
||
proposed system, the practical issues encountered throughout the
|
||
implementation process and the development strategy taken to address and avoid
|
||
such issues. Rationale is given for decisions made throughout
|
||
production of the proposed system and any known issues with the current implementation are
|
||
outlined.
|
||
|
||
\subsection{Development strategy}
|
||
Early in the design process it became apparent that in order for this project
|
||
to produce reasonable results, it would need to utilise a number of complex
|
||
algorithms to handle the various non-trivial problems that were encountered
|
||
throughout development. Python was chosen as the most suitable language for the
|
||
implementation of the system. High level language features such as dynamic
|
||
types and automatic garbage collection, combined with the large variety of
|
||
readily available packages and libraries, make the language a good choice for
|
||
the fast, flexible development approach taken throughout this project.\\
|
||
|
||
The most significant objective from the outset of the project was to provide a
|
||
system that could classify pathological signals with a degree of accuracy that
|
||
was comparable to the current state of research in the field of PCG analysis.
|
||
Given this focus and that the performance of the final product was initially
|
||
unknown, it was recognised that the design of the project would need to adapt
|
||
as the project progressed, implementing and testing high level concepts to
|
||
iteratively improve performance of the system. For this reason a high level
|
||
view of production was taken, choosing to focus on the overall system
|
||
architecture, rather than spending great amounts of time on on any one specific
|
||
element of the project (as any component of the project could be
|
||
removed/replaced entirely, if this facilitated the improvement of results).\\
|
||
|
||
With this design ethos in mind, it was decided that external packages and
|
||
libraries would be used/adapted wherever necessary to avoid spending large
|
||
amount of time developing proprietary implementations of proven concepts. By
|
||
not `reinventing the wheel', it would be possible to rapidly prototype and
|
||
evaluate high level concepts, such as the variety of machine learning and
|
||
optimisation algorithms detailed in previous sections, quickly and effectively.
|
||
Due to it's active developer community, a wide range of scientific computing
|
||
and machine learning algorithms are available, such as
|
||
NumPy~\parencite{VanDerWalt2011}, SciPy~\parencite{Millman2011} and
|
||
Scikit-Learn~\parencite{Pedregosa2011}, each of which was used extensively
|
||
throughout the project alongside other packages detailed in the following
|
||
sections.
|
||
|
||
\subsection{System overview}
|
||
The proposed system can be broken down into 5 key components: the user
|
||
interface, feature generation module, classification module, optimisation
|
||
module and evaluation module. The overall architecture of the system follows a
|
||
common design pattern for machine learning based systems; Taking a set of input
|
||
data, augmenting to produce associated data, extracting patterns from said
|
||
data and evaluating to understand the effects of various implementation
|
||
choices. The reason for this maintain focus on the central innovations of the
|
||
project which are thought to be in the implementation of models and the
|
||
analysis of input data.
|
||
|
||
\subsubsection{User interface and project framework}
|
||
As a project focused on producing a functional proof of concept for a specific
|
||
problem, this project was not aimed at producing a user-facing application. For
|
||
this reason, interactions with the system are based primarily around a simple,
|
||
well documented commandline interface (CLI) that allows a user to specify
|
||
relevant parameters through use of CLI flags (CLI output flags are detailed in
|
||
Appendix~\ref{appendixB}). This gives the user relevant
|
||
control over optional parameters that are situation specific (such as
|
||
multiprocessing, specifying locations to save data etc\ldots) through an interface that is both easy to maintain throughout
|
||
development and intuitive for a user familiar with command line applications.
|
||
Relevant information is then presented to the user as the program progresses
|
||
through the use of standard output and file based logging systems. Providing
|
||
informative feedback to the user was essential, both during development for
|
||
debugging processes, and to provide an understanding of the programs progress,
|
||
particularly in long-running iterative processes used for optimisation.\\
|
||
A file based logging system was developed using Python's built-in logging
|
||
module to allow for the monitoring of threaded processes. This allowed for
|
||
detailed monitoring of the systems progress, even when running multiple
|
||
operations concurrently.\\
|
||
|
||
A significant issue that developed as the project grew in size and complexity
|
||
was the running time. As more complex methods were implemented for feature
|
||
extraction and model optimisation, the time taken to process the relatively
|
||
large dataset grew considerably. Primarily using Python's object pickling
|
||
functionality, Methods were implemented to handle the creation and maintenance
|
||
of intermediate feature and parameter files, allowing the program to load
|
||
previously generated data. This helped in cutting computation times of
|
||
subsequent runs significantly. This also provided a convenient facility for
|
||
storing models, in a portable fashion for transfer between computers, for
|
||
example.
|
||
|
||
\subsubsection{Features extraction}
|
||
As a fundamental component of the classification system, the feature extraction
|
||
method required careful planning and development to realise the extraction of
|
||
the large number of features. As a feature of particular importance, the
|
||
integration of the MATLAB segmentation algorithm was a key part of the feature
|
||
extraction method. Given a pre-made MATLAB implementation, the most problematic
|
||
step was the integration of this code seamlessly with the largely incompatible
|
||
Python framework. This was achieved through use of Python's Subprocess module,
|
||
allowing for the code to be called through an external unix process. The nature
|
||
of this implementation limits the compatibility of the system to unix only
|
||
systems. However, given the nature of the project, this is not considered an
|
||
issue.\\
|
||
|
||
Having extracted segmentation data, methods for the efficient calculation and
|
||
storage of other features were considered. It was recognised that in order to
|
||
maintain such a large number of features, an organised and efficient data
|
||
format would be needed. The popular Pandas DataFrame library was chosen for
|
||
this task. Built on the NumPy array object, DataFrames provide a labelled data
|
||
structure, ideal for storing large quantities of labelled data. As the DataFrame
|
||
is based on the Numpy array it is able to combine the powerful matrix
|
||
operations of Numpy with intuitive database queries commonly seen in languages
|
||
such as SQL. This data structure also allows for simplistic storing and loading
|
||
of DataFrames through built-in interfaces to the HDF5 file system, simplifying
|
||
the storage of large quantities of intermediate data.\\
|
||
A minor issue that was found during development was the lack of compatibility
|
||
between DataFrames and Scikit-Learn (described in Section~\ref{Sklearn}). This
|
||
simply required careful handling of operations between these APIs in order to
|
||
avoid unintentional mishandling of data.\\
|
||
|
||
As mentioned previously, the calculation of features often required many
|
||
mathematical operations to be performed on vectors. The use of Numpy array-like
|
||
containers simplified this operation, allowing many features to be calculated
|
||
in a single line of code. Standard operations were sufficient for a number of
|
||
the features. However, mathematically complex features such as MFCCs and
|
||
wavelet transforms required dedicated processing libraries. The open source
|
||
pyWavelets and Librosa project were used for initial generation of MFCCs and
|
||
the DWT respectively~\parencite{pyWave, Mcfee2015}. Further processing such as
|
||
segmentation, entropy analysis etc\ldots is calculated to produce the final
|
||
features. For a details of all features please refer to
|
||
Appendix~\ref{appendixA}.\\
|
||
|
||
Given the large number of operation required for feature extraction, a large
|
||
amount of time needed to compute features was an unavoidable consequence of
|
||
the design. To help alleviate this issue, processing of features was
|
||
parallelised, using each sample as an individual job. The speed-up aquired
|
||
through parellisation is inherently dependant on the system running the
|
||
program, however, this significantly reduced the computation time of features.
|
||
A modified implementation of Python's multiprocessing module was used for task
|
||
management.
|
||
|
||
\subsubsection{Classification model generation}\label{Sklearn}
|
||
The Scikit-learn machine learning package is a popular choice for
|
||
implementation of machine learning algorithms in
|
||
Python.~\parencite{Pedregosa2011} As a result, it is well maintained and
|
||
benefits from a number of smaller projects that are designed to be compatible
|
||
with it's API. Such projects include Mlxtend, designed to expand Scikit-learns
|
||
range of features with more esoteric models and tools~\parencite{Raschka2016,}
|
||
and Imbalance-learn, which addresses the lack of sample balancing
|
||
transformations in Scikit-learn~\parencite{Lemaitre2017}. The project's aim to
|
||
provide a standard interface to each of it's algorithms also aids in the quick
|
||
prototyping of many machine learning algorithms on the features generated.
|
||
During the initial stages of development, this allowed for models to be tested
|
||
quickly to gain a rudimentary understanding of the performance of models. From
|
||
this, it was possible to explore more complex models and ultimately resulted in
|
||
the use of Mlxtend's Stacking ensemble classifier.\\
|
||
|
||
As the size of the classification codebase grew, so did the number of chained
|
||
operations in the classification process. Scikit-learn's Pipeline utility was
|
||
used to maintain correct handling of these process chains, allowing for models
|
||
to be easily formed of multiple transforms and classifiers in a manageable
|
||
manner. This gained particular significance with the implementation of model
|
||
optimisation methods, as discussed in the following section.
|
||
|
||
\paragraph{Model selection/optimisation}\label{ModOp}
|
||
The decision to use an ensemble classifier also presented a complication,
|
||
through the need to choose multiple complimentary base classifiers. A further
|
||
issue was in the growing number of parameters that would need to be tuned to
|
||
obtain optimal results from each of the chosen base classifiers. The adaption
|
||
of a particle swarm optimisation algorithm was found to be an effective
|
||
solution to both of these problems~\parencite{Claesen2014}. Using model
|
||
pipelines to encapsulate all transformations and classifiers into a single
|
||
object, it was possible to construct a dictionary of multiple potential
|
||
Scikit-learn models for each base classifier. Implementation of a wrapper
|
||
function around the full stacking classifier then allowed classifier pipelines
|
||
to be treated as hyperparameters, which could be optimized on a training set
|
||
alongside all active parameters for each base classifier. This resulted in the
|
||
simultaneous selection and optimization of model combinations and
|
||
hyperparameters.\\
|
||
|
||
Mlxtend's implementation of SFFS was also implemented to apply feature reduction to
|
||
the dataset. This was initially implemented as part of the processing pipeline,
|
||
resulting in feature reduction being applied on every iteration of parameter
|
||
optimisation. This initially provided a score for each model based on the
|
||
selection of it's optimal parameters. However, as models grew in complexity,
|
||
the growing computational complexity resulted in this method becoming infeasible.
|
||
This was addressed by re-implementing the feature selection algorithm after
|
||
optimisation on the full dataset. The reduction in performance from this is not
|
||
thought to be significant.\\
|
||
|
||
As with parameters, models were saved using a combination of Python's pickling
|
||
functionality and Panda's HDF5 export methods, to create fully portable models.
|
||
|
||
\subsubsection{Automatic system evaluation}
|
||
In order to accurately place the system in the context of current research,
|
||
evaluation metrics were needed to perform automatic testing of the system.
|
||
Metrics were implemented as described in Section~\ref{metrics} using a custom
|
||
multi-scorer object that was adapted to allow for the calculation of the 3
|
||
metrics: sensitivity, specificity and score. Using this object in conjunction
|
||
with a selection of Scikit-learns cross-validation objects provided a mechanism
|
||
for quickly evaluating models in an equivalent fashion to those presented in
|
||
the literature. In addition, a simple test train split was implemented allowing
|
||
for the use of a hold-out dataset. Performance on this dataset was evaluated
|
||
directly using the custom scoring functions and Scikit-learn's model scoring
|
||
methods.\\
|
||
Finally, results were formatted into tables and logged to provide instant
|
||
feedback to the user on the performance of the current model.
|
||
|
||
\section{Results}\label{Eval}
|
||
The system was evaluated using 3 primary scoring methods (as described in
|
||
Section~\ref{metrics}).
|
||
\begin{itemize}
|
||
\item Score on hidden test set
|
||
\item Leave-one-out database cross-validation
|
||
\item 10-fold stratified cross-validation
|
||
\end{itemize}
|
||
|
||
The final optimised model was generated using a total of 43 selected features
|
||
out of a possible 188. Parameter optimisation was run with 1000 parameter
|
||
evaluations, resulting in 50 iterations using 20 particles. Final parameters
|
||
and selected features
|
||
for the chosen algorithms are detailed in table~\ref{OpParam}.\\
|
||
The final scores produced for this model, evaluated using the full dataset, can
|
||
be found in Table~\ref{TestSet}. Scores for Leave-one-out cross-validation and
|
||
10-fold cross-validation can be seen in Figure~\ref{fig1} and Figure~\ref{fig2}
|
||
respectively. Details can be found in Appendix~\ref{appendixD}.
|
||
|
||
\begin{table}[H]
|
||
\centering
|
||
\caption{Hidden test-set scoring}
|
||
\label{TestSet}
|
||
\begin{tabular}{@{}lll@{}}
|
||
\toprule
|
||
$Acc$ & $Se$ & $Sp$ \\ \midrule
|
||
80.31\% & 83.15\% & 77.47\% \\ \bottomrule
|
||
\end{tabular}
|
||
\end{table}
|
||
|
||
% Make lists without bullets and compact spacing
|
||
\renewenvironment{itemize}{
|
||
\begin{list}{}{
|
||
\setlength{\leftmargin}{1.5em}
|
||
\setlength{\itemsep}{0.25em}
|
||
\setlength{\parskip}{0pt}
|
||
\setlength{\parsep}{0.25em}
|
||
}
|
||
}{
|
||
\end{list}
|
||
}
|
||
\setlist[enumerate]{itemsep=0.25em}
|
||
|
||
\begin{table}[H]
|
||
\centering
|
||
\caption{Optimised model parameters and selected features}
|
||
\scriptsize
|
||
\label{OpParam}
|
||
\begin{tabulary}{\linewidth}{LLLL}
|
||
\toprule
|
||
Base Classifier A & Base Classifier B & Base Classifier C & Meta Classifier \\ \midrule
|
||
Model: Linear SVM & Model: RBF Kernel SVM & Model: Naive-Bayes & Model: Logistic Regression \\
|
||
C: 4.2507 & C: 4.9452 & & C: 14.3611 \\
|
||
& Gamma: 0.4558 & & Penalty: L2 \\ \bottomrule
|
||
\end{tabulary}
|
||
\singlespacing
|
||
\begin{multicols}{5}
|
||
\scriptsize
|
||
\begin{itemize}
|
||
\item AvrA5diaShan
|
||
\item AvrA5s1Shan
|
||
\item AvrA5s2Shan
|
||
\item s2MFCC0
|
||
\item AvrD1s2Shan
|
||
\item s2MFCC12
|
||
\item s2MFCC4
|
||
\item s2MFCC6
|
||
\item s2MFCC9
|
||
\item s2Max
|
||
\item s2Mean
|
||
\item AvrD4s2Shan
|
||
\item AvrD5s2Shan
|
||
\item s2ZeroX
|
||
\item sd\_RR
|
||
\item TPTs1
|
||
\item s2Dur
|
||
\item sysMFCC2
|
||
\item sysMax
|
||
\item sysSampEnt
|
||
\item sysShanEngy
|
||
\item sysSkew
|
||
\item sysVar
|
||
\item diaMFCC11
|
||
\item diaMFCC12
|
||
\item diaMFCC6
|
||
\item diaSampEnt
|
||
\item diaShanEngy
|
||
\item diaVar
|
||
\item diaZeroX
|
||
\item heartRate
|
||
\item m\_RR
|
||
\item m\_Ratio\_DiaRR
|
||
\item mean\_IntS1
|
||
\item mean\_IntSys
|
||
\item s1Dur
|
||
\item s1MFCC11
|
||
\item s1MFCC4
|
||
\item s1Max
|
||
\item s1Mean
|
||
\item s1ShanEngy
|
||
\item s1Var
|
||
\item s1ZeroX
|
||
\end{itemize}
|
||
\end{multicols}
|
||
\end{table}
|
||
\begin{figure}[H]
|
||
\caption{Leave-one-out cross-validation results (mean and std-dev)}
|
||
\makebox[\textwidth]{\includegraphics[width=1.1\textwidth]{logo}}
|
||
\label{fig1}
|
||
\end{figure}
|
||
\begin{figure}[H]
|
||
\caption{Stratified 10-fold cross-validation results (mean and std-dev)}
|
||
\makebox[\textwidth]{\includegraphics[width=\textwidth]{10_fold}}
|
||
\label{fig2}
|
||
\end{figure}
|
||
Due to the replication of the approach taken for scoring entries to the Physionet
|
||
challenge, it is possible to directly compare results to challenge entries.
|
||
This aims to provide a thorough understanding of the performance of the
|
||
proposed system in relation to other approaches. The system is further compared
|
||
to some successful algorithms prior to the challenge in the subsequent section,
|
||
in order to understand the performance of the system in a wider context of
|
||
heart sound analysis.\\
|
||
|
||
The most directly comparable results are to those presented by challenge
|
||
participants, used during the training of their algorithms. Many participants
|
||
used similar cross-validation scores to determine the performance of their
|
||
algorithm before testing on the final hidden dataset, and these provide a key
|
||
insight into the performance with regard to a variety of aspects.\\
|
||
|
||
Results obtained using the Leave-one-out cross-validation scoring are similar
|
||
to those of the highest scoring algorithms in the
|
||
challenge~\parencite{Homsi2017, Bobillo2016}. As a measure for performance on
|
||
unseen data, this suggests that the proposed algorithm generalises to a similar
|
||
degree. However, it is clear that algorithms generally score poorly in this
|
||
area. This is the general consensus across many of the algorithms presented for
|
||
the challenge and is a problem that requires further work. Higher scores in
|
||
10-fold cross validation than those of Leave-one-out cross-validation further
|
||
suggest that the algorithm is highly susceptible to degraded results, most
|
||
likely as a consequence of signal qualities varying from those of the training
|
||
set.\\
|
||
In addition, scores were also applied to the algorithm after balancing the full
|
||
database. The aim of this was to remove class imbalance across the training
|
||
and test set, to gain an understanding of how the model performs on each class
|
||
equally. Results of these tests can be viewed in Appendix~\ref{appendixC}. It
|
||
was found that, although hidden test set and 10-fold cross-validation scores
|
||
aren't affected by class imbalance, there is a significant increase in the overall
|
||
leave-one-out cross-validation score from 54.16\% to 66.13\%. This is currently
|
||
thought to be caused by the model not resampling by database during training.
|
||
As resampling during training does not maintain the balance between datasets, a
|
||
disproportionate number of normal samples from larger datasets is likely to be
|
||
present in the training set. This may cause the model to overfit this quality
|
||
of sample recording, resulting in poor classification on the left out test set.
|
||
It should however be noted that the significant reduction in samples caused by
|
||
the resampling makes fitting a model that generalises to other databases easier,
|
||
thus generating inflated results. However, when comparing the balanced database
|
||
results to those of Kay et al., overall performance is still found to be
|
||
significantly better than that of their proposed algorithm, despite use of a
|
||
larger balanced dataset~\parencite{Kay2017}. This is an area that requires
|
||
further investigation, and could potentially improve results of the algorithm
|
||
considerably.
|
||
|
||
A further issue that has been highlighted throughout previous literature, is
|
||
the use of different recording equipment for the recording of normal and
|
||
pathological signals in database $e$~\parencite{Kay2017, Bobillo2016}. It is
|
||
suggested that this may have a significant impact on results, as an algorithm
|
||
may inadvertently learn the difference in noise characteristics of signals,
|
||
rather than of the heart sounds themselves. This is of particular importance as
|
||
database $e$ is the largest available dataset. This issue is found in results
|
||
through examination of database $e$ in participant's leave-one-out
|
||
cross-validation scores. This may also be true in the case of the proposed
|
||
system as database $e$ has shown considerably higher specificity in results
|
||
than those the other database, both in balanced and unbalanced datasets.
|
||
Further would be needed to understand the extent of the effect that this has on
|
||
the performance of the proposed system.\\
|
||
|
||
The final 10-fold cross-validation score was found to be between, 2 and 12\%
|
||
less than those of the highest scoring models~\parencite{Zabihi2016, Homsi2017,
|
||
Kay2017}. This suggests that, when presented with signals similar to those
|
||
presented in training, the proposed model is not able to find distinguishable
|
||
patterns between healthy and pathological signal to the same degree as the
|
||
leading algorithms in the field. However, accuracy scores of 80\% show promise
|
||
for this method. Sensitivity and specificity scores are roughly equal,
|
||
suggesting little bias between normal and abnormal classes. The method also
|
||
performed equally well on the hidden test set. The hidden test set score is
|
||
the only score based on predictions where no samples had previously been seen
|
||
by the algorithm during optimisation. A similar score to that of 10-fold cross
|
||
validation suggests that chosen features and hyperparameters generalise well to
|
||
unseen data. If scores in cross validation had been significantly higher than
|
||
that of the hidden test set, it would suggest that the model is tuning
|
||
parameters and features in a way that only benefits the score of the training
|
||
set. These scores suggest that the model is reasonably robust, and that with
|
||
improvements detailed in section~\ref{FutureWork} there is considerable
|
||
potential for this method.\\
|
||
|
||
Some of the most successful algorithms prior to the challenge also achieve
|
||
similar levels of success (As detailed in Table~\ref{PriorWorkTable}, although
|
||
these methods were evaluated on different datasets, so are not as directly
|
||
comparable)~\parencite{Ari2010, Quiceno-Manrique2010a, Maglogiannis2009}. This
|
||
clearly shows that high levels of accuracy are attainable through machine
|
||
learning methods for PCG classification. In addition these methods also venture
|
||
further into the area of classification for specific pathologies, a topic not
|
||
widely considered for the challenge.
|
||
|
||
|
||
\section{Discussion and further work}\label{FutureWork}
|
||
The current implementation of the system has provided promising results,
|
||
suggesting that the combination of techniques is well suited to the task of
|
||
abnormality detection. It is clear however, that further development of the
|
||
system could improve results further. This section defines some of the
|
||
recognised issues that could be addressed in each of the system's components,
|
||
in order to build on the work of the project in it's current form. Areas of
|
||
interest are also outlined to suggest potential avenues for building on the
|
||
system's current functionality.\\
|
||
|
||
A potential flaw in the current project is that beyond segmentation of the
|
||
original signal, pre-processing (and other components of the system) currently
|
||
make little use of biomedical domain knowledge to aid in processing of the
|
||
input data. This is largely due to the author's lack of background in this
|
||
area, prior to development of this project. An example of a project that has
|
||
implemented this is the work by Goda et al.\ who, by recognising that trained professionals
|
||
can classify most heart conditions, given at least 5 seconds of audio, was able to
|
||
further segment audio in 5 second overlapping segments, essentially providing
|
||
additional atomic samples for training~\parencite{Goda2016}. It is thought that
|
||
other such assumptions based on physiological understanding could be made in
|
||
order to further enhance the system.\\
|
||
|
||
Another area that requires further attention is the implementation of the resampling
|
||
method, as discussed in Section~\ref{Eval}. It has been shown that there is a
|
||
significant possibility for improving leave-one-out cross-validation scores by
|
||
implementing resampling per-database. In terms of effort, this would be a
|
||
relatively simple implementation and was not included in the project's current
|
||
form primarily due to comprehension of the issue late in the project. It is
|
||
hoped that this would produce considerable improvements in the generalisation
|
||
performance of the project. In addition to this, resampling could conceivably
|
||
be integrated in the hyperparameter optimisation method, in a similar fashion
|
||
to the models themselves, allowing for selection of the resampling methods
|
||
dynamically. This has the potential to improve accuracy of the classifier by
|
||
providing an optimal selection of training data for the task.\\
|
||
|
||
The combination of hyperparameter and model optimisation was regarded as one of
|
||
the key reasons for it's success. The current method, although sufficient for
|
||
the task, may be further improved through deeper understanding of the models,
|
||
the interaction of the models in stacking classifiers, and the parameter
|
||
optimisation method.\\
|
||
For example, in the final selection of models, a linear SVM, RBF kernel SVM and
|
||
Naive Bayes models were chosen by the system. From intuition it is thought that
|
||
the reason these worked well is due to the complex combination of linear and
|
||
non-linear relationships in the input features. As the RBF kernel is well
|
||
suited to differentiating non linear patterns, and the linear SVM is well suited
|
||
for linear patterns, these models would in theory compliment one another. This
|
||
is also true of the Naive-bayes model, which considers each feature in
|
||
isolation from all others, contrasting the complex inter-feature relationships
|
||
(such as those most likely present in the MFCC and wavelet coefficients, for
|
||
example) considered by the other models. Given this intuition, more targeted
|
||
models could be provided to the dictionary for model selection, allowing for
|
||
more efficient convergence of the optimisaton algorithm. This theory is
|
||
however, simply conjecture at the current stage, and would require further
|
||
research to form a better understanding of the problem.\\
|
||
Building on this, it is known that a wide variety of optimisation methods, such
|
||
as genetic algorithms and swarm algorithms~\parencite[p.32]{Sesmero2015}.
|
||
Deeper understanding of these may yeild a more suitable optimisation method
|
||
than the tradition particle swarm method used in the current implementation.\\
|
||
|
||
The features in the current project clearly provided a reasonable
|
||
representation of the signal, suitable for the classification task. This was
|
||
largely due to the selective procurement of these features from literature,
|
||
where they had been demonstrated to provide useful representations of audio
|
||
both for PCG signals and in a more general context. Further analysis of
|
||
individual features and the feature set as a whole may allow for a more refined
|
||
feature set to be developed. This is in contrast to the current methodology
|
||
which largely relies on the trial and error approach of SFFS. A deeper
|
||
understanding may allow for more complex and context specific features to be
|
||
added to the feature set, and for other ineffective features to be removed.\\ An
|
||
example of this is the implementation of MFCC features. Although the current
|
||
MFCCs were seen to be relevant features, the coefficients are used directly in
|
||
a rather simplistic fashion, without any further processing. Other
|
||
implementations have shown that further processing could be used to augment
|
||
these features~\parencite{Zabihi2016}, providing additional potentially relevant
|
||
statistical data to the feature-set.\\
|
||
|
||
An application of the system that has not been considered in any great depth is
|
||
the further suitability of the method for more specific cardiovascular
|
||
classification tasks. A number of projects prior to the challenge have
|
||
demonstrated the success of methods similar to the proposed method in this
|
||
area~\parencite{Ari2010, Quiceno-Manrique2010a}. The work of Ari et al.\ is
|
||
of particular interest, as it demonstrated a reasonable general classifier
|
||
using similar SVM based classifiers to that of the current project, but also
|
||
aimed to further classify the signal using subsequent nested classifiers. This
|
||
method could be applied to potentially develop a highly useful diagnostic tool.\\
|
||
|
||
Given the quality of results presented, it is believed that this project has
|
||
successfully demonstrated the potential for heuristically optimised stacking
|
||
classifiers to detect pathological heart sounds. With additional improvements,
|
||
it is thought that the proposed methodology could be built upon, potentially
|
||
providing an innovative alternative solution to the currently used methods for
|
||
heart sound analysis.
|
||
|
||
\pagebreak
|
||
|
||
\appendix
|
||
\section*{Appendices}
|
||
\addcontentsline{toc}{section}{Appendices}
|
||
\renewcommand{\thesubsection}{\Alph{subsection}}
|
||
|
||
\subsection{Table of Features}\label{appendixA}
|
||
\begin{table}[H]
|
||
\centering
|
||
\caption{Description of features}
|
||
\onehalfspacing
|
||
\tiny
|
||
\label{my-label}
|
||
\begin{tabulary}{\linewidth}{LLLLL}
|
||
\hline
|
||
Tag & Feature Name & Description \\ \hline
|
||
heartRate & Heart Rate & The number of beats per minute (BPM) \\
|
||
m\_RR & Mean RR Interval & Average length of heart cycles \\
|
||
sd\_RR & Std-dev RR Interval & Standard deviation of heart cycles \\
|
||
mean\_Int* & Mean Segment Interval & Average length of segment \\
|
||
sd\_Int* & Std-dev Segment Interval & Standard deviation of segment \\
|
||
R\_SysRR & Systolic-RR Ratio & Ratio of Systolic interval to RR interval \\
|
||
R\_DiaRR & Diastolic-RR Ratio & Ratio of Diastolic interval to RR interval \\
|
||
R\_SysDia & Ratio of Systolic-RR/Diastolic-RR Ratios & Ratio of above ratios \\
|
||
*ZeroX & Zero-crossing & Zero-crossing rate of a segment \\
|
||
*RMS & Root Mean Square & The Root Mean Square of a segment \\
|
||
*ShanEngy & Shannon Energy & The Averaged Shannon Energy Envelope of a segment \\
|
||
*Dur & Duration & The duration of a segment \\
|
||
*Max & Max & The peak value of an absolute segment \\
|
||
*Mean & Mean & The mean value of a segment \\
|
||
*Skew & Skewness & The temporal skewness of a segment \\
|
||
*Kurt & Kurtosis & The temporal kurosis of a segment \\
|
||
*Var & Variance & The variance of a segment \\
|
||
*SampEnt & Sample Entropy & The sample entropy of a segment \\
|
||
*ShanEnt & Shannon Entropy & The Shannon Entropy of a segment \\
|
||
TPT* & Total Power (time) & The total power of a segment in the time domain \\
|
||
TPF* & Total Power (frequency) & The total power of a segment in the frequency domain \\
|
||
*Flat & Spectral Flatness & The flatness of a segment's frequency spectrum \\
|
||
*Cent & Spectral Centroid & The centroid of a segment's frequency spectrum \\
|
||
*Spread & Spectral Spread & The spread of a segment's frequency spectrum \\
|
||
*MFCC\{$n$\} & Mel-frequency Cepstrum Coefficients & MFCC coefficient number $n$, where $n = \{1, \ldots, 13\}$ \\
|
||
m\_Ratio\_SysRR & Mean RR/Systole interval ratio & Mean value of the interval ratios between systole and RR in each heart beat \\
|
||
sd\_Ratio\_SysRR & Std-dev RR/Systole interval ratio & Std-dev value of the interval ratios between systole and RR in each heart beat \\
|
||
m\_Ratio\_DiaRR & Mean RR/Diastole interval ratio & Mean value of the interval ratios between diastole and RR in each heart beat \\
|
||
sd\_Ratio\_DiaRR & Std-dev RR/Diastole interval ratio & Std-dev value of the interval ratios between diastole and RR in each heart beat \\
|
||
m\_Ratio\_SysDia & Mean Systole/Diastole interval ratio & Mean value of the interval ratios between systole and diastole in each heart beat \\
|
||
sd\_Ratio\_SysDia & Std-dev Systole/Diastole interval ratio & Std-dev value of the interval ratios between systole and diastole in each heart beat \\
|
||
D[1-5]Shan & Detail coefficient shannon entropy & Shannon entropy of DWT detail coefficient 1-5 \\
|
||
A5Shan & Approximation coefficient shannon entropy & Shannon entropy of DWT approximation coefficient 1-5 \\
|
||
\mbox{TotD[1-5]*Shan} & Total detail coefficient shannon entropy & Total Shannon entropy of DWT detail coefficient 1-5 across signal \\
|
||
TotA5*Shan & Total approximation coefficient shannon entropy & Total Shannon entropy of DWT approximation coefficient 1-5 across signal \\ \hline
|
||
\end{tabulary}
|
||
\justifying
|
||
\scriptsize
|
||
Feature sources include:~\parencite{Homsi2016, Schmidt2015, Liang1998,
|
||
Lerch2012}\\
|
||
`*' --- denotes feature is applied to S1, systolic, S2 and diastolic segments
|
||
respectively.
|
||
\end{table}
|
||
\pagebreak
|
||
|
||
|
||
\subsection{Commandline Interface}\label{appendixB}
|
||
\singlespacing
|
||
\lstset{basicstyle=\tiny, style=mystyle}
|
||
\begin{lstlisting}[numbers=none]
|
||
usage: main.py [-h] [--features-fname OUTFNAME] [--segment] [--optimize]
|
||
[--eval EVAL] [--select-features SELECT\_FEATURES] [--backward]
|
||
[--parameters_fname OUTFNAME] [--fs_fname OUTFNAME]
|
||
[--no-parallel] [--reanalyse] [--verbose]
|
||
[--resample-mix RESAMPLE_MIX] [--keep-logs]
|
||
TESTDIR OUTDIR
|
||
|
||
Script for the classification of PCG data.
|
||
|
||
positional arguments:
|
||
TESTDIR Directory of test data to train the system
|
||
OUTDIR Directory to store output analyses
|
||
|
||
optional arguments:
|
||
-h, --help show this help message and exit
|
||
--features-fname, -o
|
||
Specify the name of the file to save generated
|
||
features to for future use
|
||
--segment Run Matlab segmentation script to create segmentation
|
||
analysis
|
||
--optimize Run optimization algorithm to find best model and
|
||
parameters for classifier
|
||
--eval, -e Number of evaluation to pass to the particle swarm
|
||
optimization
|
||
--select-features
|
||
Run feature selection algorithm to find best features
|
||
for model, either selecting or reducing features by
|
||
the integer specified. This depends on use of
|
||
--backward flag, to determine forward or backward
|
||
feature selection. (a value of 0 skips feature
|
||
selection entirely, using previously generated
|
||
features if available. A value less than 0 uses all
|
||
available features.)
|
||
--backward, -b Runs backward feature selection as opposed to default
|
||
forward selection.
|
||
--parameters_fname
|
||
Specify the name of the file to save generated
|
||
features to for future use
|
||
--fs_fname Specify the name of the file to save generated feature
|
||
selection model to for future use
|
||
--no-parallel, -p Disable processing in parallel. (Will likely decrease
|
||
performance but may aid in debugging)
|
||
--reanalyse Force regeneration of database features
|
||
--verbose, -v Specifies level of verbosity in output. For example:
|
||
'-vvvvv' will output all information. '-v' will output
|
||
minimal information.
|
||
--keep-logs Keep previously generated logs that aren't overwritten
|
||
by current process
|
||
\end{lstlisting}
|
||
\doublespacing
|
||
\pagebreak{}
|
||
|
||
\subsection{Final results}\label{appendixD}
|
||
Results of of tests on final optimised model\\
|
||
Leave-one-out scores are shown in Table~\ref{LOGO}\\
|
||
Stratified cross-validation scores can be found in Table~\ref{KFCV}\\
|
||
|
||
\begin{table}[H]
|
||
\doublespacing
|
||
\caption{Leave-one-out scores}
|
||
\label{LOGO}
|
||
\footnotesize
|
||
All scores are an average of 10 iterations $\pm$ standard-deviation
|
||
\scriptsize
|
||
\centering
|
||
\begin{tabulary}{\linewidth}{LCCCCCCC}
|
||
\toprule
|
||
& A & B & C & D & E & F & Mean \\ \midrule
|
||
$Acc$ & $0.5395\pm0.0104$ & $0.4896\pm0.0129$ & $0.5673\pm0.0298$ & $0.5173\pm0.0223$ & $0.5869\pm0.0300$ & $0.5492\pm0.0140$ & $0.5416\pm0.0318$ \\
|
||
$Se$ & $0.7281\pm0.0164$ & $0.8664\pm0.0240$ & $0.6775\pm0.0208$ & $0.7865\pm0.0218$ & $0.5397\pm0.0459$ & $0.7387\pm0.0493$ & $0.7228\pm0.1005$ \\
|
||
$Sp$ & $0.3509\pm0.0264$ & $0.1127\pm0.012$ & $0.4571\pm0.0571$ & $0.2481\pm0.0416$ & $0.6340\pm0.0387$ & $0.3596\pm0.0464$ & $0.3604\pm0.1624$ \\ \bottomrule
|
||
\end{tabulary}
|
||
\end{table}
|
||
|
||
\begin{table}[H]
|
||
\caption{10-fold cross-validation score}
|
||
\footnotesize
|
||
All scores are an average of 10 iterations $\pm$ standard-deviation
|
||
\doublespacing
|
||
\label{KFCV}
|
||
\scriptsize
|
||
\centering
|
||
\begin{tabulary}{\linewidth}{LCCCCCCCCCCC}
|
||
\toprule
|
||
& 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & Mean \\ \midrule
|
||
$Acc$ & $0.7969\pm0.0246$ & $0.8049\pm0.0244$ & $0.8043\pm0.0153$ & $0.8111\pm0.0295$ & $0.8095\pm0.0261$ & $0.7999\pm0.0208$ & $0.8061\pm0.0299$ & $0.8150\pm0.0198$ & $0.8140\pm0.0245$ & $0.7928\pm0.0224$ & $0.8055\pm0.0069$ \\
|
||
$Se$ & $0.8121\pm0.0420$ & $0.8164\pm0.0360$ & $0.8193\pm0.0302$ & $0.8184\pm0.0634$ & $0.8158\pm0.0484$ & $0.8061\pm0.0438$ & $0.8325\pm0.0546$ & $0.8421\pm0.0321$ & $0.8246\pm0.0474$ & $0.7798\pm0.0302$ & $0.8167\pm0.0157$ \\
|
||
$Sp$ & $0.7818\pm0.0293$ & $0.7935\pm0.0267$ & $0.7894\pm0.0208$ & $0.8037\pm0.0280$ & $0.8033\pm0.0226$ & $0.7937\pm0.0214$ & $0.7798\pm0.0229$ & $0.7878\pm0.0206$ & $0.8035\pm0.0219$ & $0.8059\pm0.0228$ & $0.7942\pm0.0091$ \\ \bottomrule
|
||
\end{tabulary}
|
||
\end{table}
|
||
|
||
|
||
\pagebreak
|
||
\subsection{Balanced dataset test results}\label{appendixC}
|
||
Results of testing database using a resampled, balanced dataset.\\
|
||
Dataset was resampled by database, using jacknife resampling (Sampling without
|
||
replacement) and consisted of a total of 944 samples.
|
||
\begin{table}[H]
|
||
\centering
|
||
\caption{Hidden test-set scoring}
|
||
\begin{tabular}{@{}lll@{}}
|
||
\toprule
|
||
$Acc$ & $Se$ & $Sp$ \\ \midrule
|
||
80.77\% & 79.41\% & 82.14\% \\ \bottomrule
|
||
\end{tabular}
|
||
\end{table}
|
||
|
||
\begin{table}[H]
|
||
\doublespacing
|
||
\caption{Leave-one-out scores}
|
||
\footnotesize
|
||
All scores are an average of 10 iterations $\pm$ standard-deviation
|
||
\scriptsize
|
||
\centering
|
||
\begin{tabulary}{\linewidth}{LCCCCCCC}
|
||
\toprule
|
||
& A & B & C & D & E & F & Mean \\ \midrule
|
||
$Acc$ & $0.5784\pm0.0153$ & $0.6062\pm0.0131$ & $0.8737\pm0.0131$ & $0.5939\pm0.0302$ & $0.7022\pm0.0136$ & $0.6130\pm0.0189$ & $0.6613\pm0.1029$ \\
|
||
$Se$ & $0.4984\pm0.0225$ & $0.8816\pm0.0000$ & $0.7475\pm0.0261$ & $0.6692\pm0.0319$ & $0.6417\pm0.0227$ & $0.7290\pm0.0636$ & $0.6946\pm0.1161$ \\
|
||
$Sp$ & $0.6585\pm0.0342$ & $0.3309\pm0.0262$ & $1.0000\pm0.0000$ & $0.5185\pm0.0741$ & $0.7628\pm0.0134$ & $0.4971\pm0.0509$ & $0.6280\pm0.2141$ \\ \bottomrule
|
||
\end{tabulary}
|
||
\end{table}
|
||
|
||
\begin{table}[H]
|
||
\caption{10-fold cross-validation score}
|
||
\footnotesize
|
||
All scores are an average of 10 iterations $\pm$ standard-deviation
|
||
\doublespacing
|
||
\scriptsize
|
||
\centering
|
||
\begin{tabulary}{\linewidth}{LCCCCCCCCCCC}
|
||
\toprule
|
||
& 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & Mean \\ \midrule
|
||
$Acc$ & $0.7961\pm0.0364$ & $0.8187\pm0.0384$ & $0.8108\pm0.0238$ & $0.8019\pm0.0316$ & $0.8103\pm0.0326$ & $0.8217\pm0.0417$ & $0.7845\pm0.0593$ & $0.8053\pm0.0262$ & $0.8023\pm0.0148$ & $0.8105\pm0.0312$ & $0.8062\pm0.0103$ \\
|
||
$Se$ & $0.7922\pm0.0603$ & $0.8060\pm0.0453$ & $0.7853\pm0.0363$ & $0.7772\pm0.0489$ & $0.8044\pm0.0504$ & $0.8070\pm0.0538$ & $0.7553\pm0.0700$ & $0.7623\pm0.0312$ & $0.7772\pm0.0272$ & $0.8088\pm0.0461$ & $0.7876\pm0.0184$ \\
|
||
$Sp$ & $0.8000\pm0.0360$ & $0.8315\pm0.0504$ & $0.8363\pm0.0289$ & $0.8266\pm0.0366$ & $0.8161\pm0.0351$ & $0.8363\pm0.0627$ & $0.8137\pm0.0578$ & $0.8484\pm0.0380$ & $0.8274\pm0.0284$ & $0.8123\pm0.0412$ & $0.8249\pm0.0136$ \\ \bottomrule
|
||
\end{tabulary}
|
||
\end{table}
|
||
\pagebreak
|
||
\printbibliography{}
|
||
|
||
\end{document}
|