Added sklearn to install script

This commit is contained in:
2016-04-14 14:25:10 +01:00
parent d9d64e5b02
commit e35ddae1f5
6 changed files with 384 additions and 58 deletions
+1
View File
@@ -5,3 +5,4 @@ pip install pysndfile
pip install h5py pip install h5py
pip install https://github.com/Pezz89/fileops/zipball/master pip install https://github.com/Pezz89/fileops/zipball/master
pip install -e ./ pip install -e ./
pip install sklearn
+122 -6
View File
@@ -1,23 +1,139 @@
# Specify analysis parameters for root mean square analysis.
rms = { rms = {
# Analysis window sizes can be changed for each analysis individually.
# These do not need to match the grain size of the matcher or synthesis.
"window_size": 100, "window_size": 100,
"overlap": 2, "overlap": 8,
} }
f0 = {
"window_size": 4096,
"overlap": 8,
# Currently all frames below this ratio are discarded and left as silence.
# Different databases will require different values for the best results.
# Noisier databases will need lower values than more tonal databases.
"ratio_threshold": 0.45
}
# Specify analysis parameters for variance analysis.
variance = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal kurtosis analysis.
kurtosis = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal skewness analysis.
skewness = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for FFT analysis.
fft = {
# The FFT window size determines the window size for all spectral analyses.
"window_size": 4096
}
database = {
# Enables creation of symbolic links to files not in the database rather
# than making physical copies.
"symlink": True
}
# Sets the weighting for each analysis. A higher weighting gives an analysis
# higher precedence when finding the best matches.
matcher_weightings = {
"f0" : 0.5,
"spccntr" : 1.,
"spcsprd" : 1.,
"spcflux" : 3.,
"spccf" : 3.,
"spcflatness": 3.,
"zerox" : 1.,
"rms" : 0.1,
"peak": 0.1,
"centroid": 0.5,
"kurtosis": 2.,
"skewness": 2.,
"variance": 0.,
"harm_ratio": 2
}
# Specifies the method for averaging analysis frames to create a single value
# for comparing to other grains. Possible formatters are: 'mean', 'median',
# 'log2_mean', 'log2_median'
analysis_dict = { analysis_dict = {
# log2_median formats using mel scale. This is useful for analyses such as
# F0.
"f0": "log2_median", "f0": "log2_median",
"rms": "mean" "rms": "mean",
"zerox": "mean",
"spccntr": "median",
"spcsprd": "median",
"spcflux": "median",
"spccf": "median",
"spcflatness": "median",
"peak": "mean",
"centroid": "mean",
"kurtosis": "mean",
"skewness": "mean",
"variance": "mean",
"harm_ratio": "mean"
} }
analysis = { analysis = {
# Force the deletion of any pre-existing analyses to create new ones. This
# is needed for overwriting old analyses generated with different
# parameters to the current ones.
"reanalyse": False "reanalyse": False
} }
matcher = {
# Force the re-matching of analyses
"rematch": False,
# This value must be the same as the synthesis grain size to avoid the
# speeding up or slowing down of the resulting file in relation to the
# original.
"grain_size": 100,
"overlap": 8,
# Defines the number of matches to keep for synthesis. Note that this must
# also be specified in the synthesis config
"match_quantity": 5,
# Choose the algorithm used to perform matching. kdtree is recommended for
# larger datasets.
"method": 'kdtree'
}
synthesizer = {
# Artificially scale the output grain by the difference in RMS values
# between source and target.
"enforce_intensity": True,
# Specify the ratio limit that the grain can be scaled by.
"enf_intensity_ratio_limit": 1000.,
# Artificially modify the pitch by the difference in f0 values between
# source and target.
"enforce_f0": True,
# Specify the ratio limit that the grain can be modified by.
"enf_f0_ratio_limit": 10.,
"grain_size": 100,
"overlap": 8,
# Normalize output, avoid clipping of final output by scaling the final
# frames.
"normalize" : True,
# Defines the number of potential grains to choose from matches when
# synthesizing output.
"match_quantity": 5
}
# Specifies the format for the output file. Changing this has not been tested
# so may produce errors/undesirable results.
output_file = { output_file = {
"samplerate": 44100, "samplerate": 44100,
"format": 131075, "format": 131075,
"channels": 1 "channels": 1
} }
database = {
"symlink": True
}
+19 -18
View File
@@ -15,7 +15,7 @@ used to determine the central point of a signal's amplitude and is calculated
as: as:
.. math:: .. math::
C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)} C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)}.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -34,10 +34,10 @@ defined as:
.. math:: .. math::
R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m) R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m)
Then normalizing: then normalizing:
.. math:: .. math::
\Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}} \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}.
The fundamental period of the signal is then calculated as the point between The fundamental period of the signal is then calculated as the point between
:math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the :math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the
@@ -45,7 +45,7 @@ original. :math:`T_{min}` and :math:`T_{max}` are defined as the minimum and max
the fundamental period. the fundamental period.
.. math:: .. math::
y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\} y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\}.
In order to improve the accuracy of peak detection, parabolic interpolation is In order to improve the accuracy of peak detection, parabolic interpolation is
used to estimate the peak's location with greater accuracy by using the peak used to estimate the peak's location with greater accuracy by using the peak
@@ -66,7 +66,7 @@ Ref: :cite:`smith2011sasp`
From the fundamental period, the frequency is then calculated as: From the fundamental period, the frequency is then calculated as:
.. math:: .. math::
f_0^n = \frac{1}{T_0^n} f_0^n = \frac{1}{T_0^n}.
Ref: :cite:`itaa2014` Ref: :cite:`itaa2014`
@@ -83,7 +83,7 @@ the signal. The calculation of the STFT is defined as:
.. math:: .. math::
X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i - X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i -
i_s(n))\frac{2\pi}{K}\Big)} i_s(n))\frac{2\pi}{K}\Big)}.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -96,7 +96,7 @@ of confidence measure in determining the validity of F0 values. It is
calculated as part of the F0 estimation algorithm as: calculated as part of the F0 estimation algorithm as:
.. math:: .. math::
HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}} HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -107,7 +107,7 @@ values indicate a flatter distribution and positive values indicate a more
"peaky" distribution. Kurtosis is calculated as: "peaky" distribution. Kurtosis is calculated as:
.. math:: .. math::
TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3 TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -117,16 +117,17 @@ Peak amplitude measures the highest peak in the absolute signal. It is
calculated as: calculated as:
.. math:: .. math::
P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\} P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}.
RMS RMS
~~~ ~~~
The perceived loudness of a signal is an important feature as it can be related The perceived loudness of a signal is an important feature as it can be related
to the dynamics of the signal. RMS is used as a measure of sound intensity and to the dynamics of the signal. RMS is used as a measure of sound intensity and
is used for distinguishing between loud and quiet audio. It is calculated as: is used for distinguishing between loud and quiet audio. It is calculated as,
where :math:`K` is the total number of samples:
.. math:: .. math::
RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2} RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}.
Other methods that take the human perception of loudness into account may Other methods that take the human perception of loudness into account may
provide more perceptually relevant results. However the RMS measurement provide more perceptually relevant results. However the RMS measurement
@@ -142,7 +143,7 @@ values indicate that the spectral content is centred in higher frequencies and
lower values indicate a lower centre. The spectral centroid is calculated as: lower values indicate a lower centre. The spectral centroid is calculated as:
.. math:: .. math::
SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2} SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}.
The result is the sum of magnitudes, weighted by their index, normalized by the The result is the sum of magnitudes, weighted by their index, normalized by the
unweighted sum. unweighted sum.
@@ -158,7 +159,7 @@ This differentiates between flat spectrums and sinusoidal spectrums. (low values
representing the former and high values representing the latter.) representing the former and high values representing the latter.)
.. math:: .. math::
SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | } SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -171,7 +172,7 @@ values that represent a more tonal signal. Spectral flatness is calculated as:
.. math:: .. math::
TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot
\sum_{k=0}^{K/2-1} | X(k,n) | } \sum_{k=0}^{K/2-1} | X(k,n) | }.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -184,7 +185,7 @@ similar frames (that suggests a steady state signal). It is calculated as:
.. math:: .. math::
SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2 SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2
}}{K/2} }}{K/2}.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -196,7 +197,7 @@ and is associated with perceptions of timbre. It is calculated as:
.. math:: .. math::
SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n) SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n)
| ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}} | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -206,7 +207,7 @@ The variance of a signal measures it's spread around the signal's arithmetic
mean. It is used in the calculation of Kurtosis and is calculated as: mean. It is used in the calculation of Kurtosis and is calculated as:
.. math:: .. math::
\sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2 \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2.
Ref: :cite:`lerch2012itaca` Ref: :cite:`lerch2012itaca`
@@ -218,7 +219,7 @@ a signal, as noisy signals will pass from positive to negative more frequently
than periodic signals. It is calculated as: than periodic signals. It is calculated as:
.. math:: .. math::
Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] | Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] |
\text{Where the sgn function is defined as:} \text{Where the sgn function is defined as:}
+116 -13
View File
@@ -1,36 +1,139 @@
# Specify analysis parameters for root mean square analysis.
rms = { rms = {
# Analysis window sizes can be changed for each analysis individually.
# These do not need to match the grain size of the matcher or synthesis.
"window_size": 100, "window_size": 100,
"overlap": 2, "overlap": 8,
} }
analysis_dict = { f0 = {
"f0": "log2_median", "window_size": 4096,
"rms": "mean" "overlap": 8,
# Currently all frames below this ratio are discarded and left as silence.
# Different databases will require different values for the best results.
# Noisier databases will need lower values than more tonal databases.
"ratio_threshold": 0.45
} }
# Specify analysis parameters for variance analysis.
variance = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal kurtosis analysis.
kurtosis = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal skewness analysis.
skewness = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for FFT analysis.
fft = {
# The FFT window size determines the window size for all spectral analyses.
"window_size": 4096
}
database = {
# Enables creation of symbolic links to files not in the database rather
# than making physical copies.
"symlink": True
}
# Sets the weighting for each analysis. A higher weighting gives an analysis
# higher precedence when finding the best matches.
matcher_weightings = { matcher_weightings = {
"f0" : 1., "f0" : 0.5,
"rms": 1. "spccntr" : 1.,
"spcsprd" : 1.,
"spcflux" : 3.,
"spccf" : 3.,
"spcflatness": 3.,
"zerox" : 1.,
"rms" : 0.1,
"peak": 0.1,
"centroid": 0.5,
"kurtosis": 2.,
"skewness": 2.,
"variance": 0.,
"harm_ratio": 2
}
# Specifies the method for averaging analysis frames to create a single value
# for comparing to other grains. Possible formatters are: 'mean', 'median',
# 'log2_mean', 'log2_median'
analysis_dict = {
# log2_median formats using mel scale. This is useful for analyses such as
# F0.
"f0": "log2_median",
"rms": "mean",
"zerox": "mean",
"spccntr": "median",
"spcsprd": "median",
"spcflux": "median",
"spccf": "median",
"spcflatness": "median",
"peak": "mean",
"centroid": "mean",
"kurtosis": "mean",
"skewness": "mean",
"variance": "mean",
"harm_ratio": "mean"
} }
analysis = { analysis = {
# Force the deletion of any pre-existing analyses to create new ones. This
# is needed for overwriting old analyses generated with different
# parameters to the current ones.
"reanalyse": False "reanalyse": False
} }
matcher = { matcher = {
# Force the re-matching of analyses
"rematch": False, "rematch": False,
# This value must be the same as the synthesis grain size to avoid the
# speeding up or slowing down of the resulting file in relation to the
# original.
"grain_size": 100, "grain_size": 100,
"overlap": 2, "overlap": 8,
# Defines the number of matches to keep for synthesis. # Defines the number of matches to keep for synthesis. Note that this must
"match_quantity": 20 # also be specified in the synthesis config
"match_quantity": 5,
# Choose the algorithm used to perform matching. kdtree is recommended for
# larger datasets.
"method": 'kdtree'
} }
synthesizer = {
# Artificially scale the output grain by the difference in RMS values
# between source and target.
"enforce_intensity": True,
# Specify the ratio limit that the grain can be scaled by.
"enf_intensity_ratio_limit": 1000.,
# Artificially modify the pitch by the difference in f0 values between
# source and target.
"enforce_f0": True,
# Specify the ratio limit that the grain can be modified by.
"enf_f0_ratio_limit": 10.,
"grain_size": 100,
"overlap": 8,
# Normalize output, avoid clipping of final output by scaling the final
# frames.
"normalize" : True,
# Defines the number of potential grains to choose from matches when
# synthesizing output.
"match_quantity": 5
}
# Specifies the format for the output file. Changing this has not been tested
# so may produce errors/undesirable results.
output_file = { output_file = {
"samplerate": 44100, "samplerate": 44100,
"format": 131075, "format": 131075,
"channels": 1 "channels": 1
} }
database = {
"symlink": True
}
+122 -19
View File
@@ -1,36 +1,139 @@
# Specify analysis parameters for root mean square analysis.
rms = { rms = {
# Analysis window sizes can be changed for each analysis individually.
# These do not need to match the grain size of the matcher or synthesis.
"window_size": 100, "window_size": 100,
"overlap": 2, "overlap": 8,
} }
f0 = {
"window_size": 4096,
"overlap": 8,
# Currently all frames below this ratio are discarded and left as silence.
# Different databases will require different values for the best results.
# Noisier databases will need lower values than more tonal databases.
"ratio_threshold": 0.45
}
# Specify analysis parameters for variance analysis.
variance = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal kurtosis analysis.
kurtosis = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for temporal skewness analysis.
skewness = {
"window_size": 100,
"overlap": 8
}
# Specify analysis parameters for FFT analysis.
fft = {
# The FFT window size determines the window size for all spectral analyses.
"window_size": 4096
}
database = {
# Enables creation of symbolic links to files not in the database rather
# than making physical copies.
"symlink": True
}
# Sets the weighting for each analysis. A higher weighting gives an analysis
# higher precedence when finding the best matches.
matcher_weightings = {
"f0" : 0.5,
"spccntr" : 1.,
"spcsprd" : 1.,
"spcflux" : 3.,
"spccf" : 3.,
"spcflatness": 3.,
"zerox" : 1.,
"rms" : 0.1,
"peak": 0.1,
"centroid": 0.5,
"kurtosis": 2.,
"skewness": 2.,
"variance": 0.,
"harm_ratio": 2
}
# Specifies the method for averaging analysis frames to create a single value
# for comparing to other grains. Possible formatters are: 'mean', 'median',
# 'log2_mean', 'log2_median'
analysis_dict = { analysis_dict = {
# log2_median formats using mel scale. This is useful for analyses such as
# F0.
"f0": "log2_median", "f0": "log2_median",
"rms": "mean" "rms": "mean",
"zerox": "mean",
"spccntr": "median",
"spcsprd": "median",
"spcflux": "median",
"spccf": "median",
"spcflatness": "median",
"peak": "mean",
"centroid": "mean",
"kurtosis": "mean",
"skewness": "mean",
"variance": "mean",
"harm_ratio": "mean"
} }
analysis = { analysis = {
# Force the deletion of any pre-existing analyses to create new ones. This
# is needed for overwriting old analyses generated with different
# parameters to the current ones.
"reanalyse": False "reanalyse": False
} }
matcher = {
# Force the re-matching of analyses
"rematch": False,
# This value must be the same as the synthesis grain size to avoid the
# speeding up or slowing down of the resulting file in relation to the
# original.
"grain_size": 100,
"overlap": 8,
# Defines the number of matches to keep for synthesis. Note that this must
# also be specified in the synthesis config
"match_quantity": 5,
# Choose the algorithm used to perform matching. kdtree is recommended for
# larger datasets.
"method": 'kdtree'
}
synthesizer = {
# Artificially scale the output grain by the difference in RMS values
# between source and target.
"enforce_intensity": True,
# Specify the ratio limit that the grain can be scaled by.
"enf_intensity_ratio_limit": 1000.,
# Artificially modify the pitch by the difference in f0 values between
# source and target.
"enforce_f0": True,
# Specify the ratio limit that the grain can be modified by.
"enf_f0_ratio_limit": 10.,
"grain_size": 100,
"overlap": 8,
# Normalize output, avoid clipping of final output by scaling the final
# frames.
"normalize" : True,
# Defines the number of potential grains to choose from matches when
# synthesizing output.
"match_quantity": 5
}
# Specifies the format for the output file. Changing this has not been tested
# so may produce errors/undesirable results.
output_file = { output_file = {
"samplerate": 44100, "samplerate": 44100,
"format": 131075, "format": 131075,
"channels": 1 "channels": 1
} }
synthesizer = {
"enforce_rms": True,
"enf_rms_ratio_limit": 5.,
"enforce_f0": True,
"enf_f0_ratio_limit": 10.,
"grain_size": 100,
"overlap": 2,
"normalize" : True,
# Defines the number of potential grains to choose from matches when
# synthesizing output.
"match_quantity": 20
}
database = {
"symlink": True
}
+4 -2
View File
@@ -84,7 +84,7 @@ For this demonstration, the following file structure will be used:
|-- target.03.wav |-- target.03.wav
`-- target.04.wav `-- target.04.wav
A source database containing a small selection of trumpet samples (aquired from A source database containing a small selection of trumpet samples (acquired from
http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4 http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4
target sounds. This will produce 4 output files, one for each target sound. target sounds. This will produce 4 output files, one for each target sound.
@@ -417,7 +417,9 @@ concatenate.py Script Flags
--match_method Choose the algorithm to use when matching analyses. Available algorithms are: --match_method Choose the algorithm to use when matching analyses. Available algorithms are:
Brute force: 'bruteforce' Brute force: 'bruteforce' (BROKEN. The brute force
matcher no longer works with the current release of
this script. Use the K-d Tree Search.)
K-d Tree Search: 'kdtree' K-d Tree Search: 'kdtree'