Added sklearn to install script
This commit is contained in:
@@ -5,3 +5,4 @@ pip install pysndfile
|
|||||||
pip install h5py
|
pip install h5py
|
||||||
pip install https://github.com/Pezz89/fileops/zipball/master
|
pip install https://github.com/Pezz89/fileops/zipball/master
|
||||||
pip install -e ./
|
pip install -e ./
|
||||||
|
pip install sklearn
|
||||||
|
|||||||
@@ -1,23 +1,139 @@
|
|||||||
|
# Specify analysis parameters for root mean square analysis.
|
||||||
rms = {
|
rms = {
|
||||||
|
# Analysis window sizes can be changed for each analysis individually.
|
||||||
|
# These do not need to match the grain size of the matcher or synthesis.
|
||||||
"window_size": 100,
|
"window_size": 100,
|
||||||
"overlap": 2,
|
"overlap": 8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
f0 = {
|
||||||
|
"window_size": 4096,
|
||||||
|
"overlap": 8,
|
||||||
|
# Currently all frames below this ratio are discarded and left as silence.
|
||||||
|
# Different databases will require different values for the best results.
|
||||||
|
# Noisier databases will need lower values than more tonal databases.
|
||||||
|
"ratio_threshold": 0.45
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for variance analysis.
|
||||||
|
variance = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal kurtosis analysis.
|
||||||
|
kurtosis = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal skewness analysis.
|
||||||
|
skewness = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for FFT analysis.
|
||||||
|
fft = {
|
||||||
|
# The FFT window size determines the window size for all spectral analyses.
|
||||||
|
"window_size": 4096
|
||||||
|
}
|
||||||
|
|
||||||
|
database = {
|
||||||
|
# Enables creation of symbolic links to files not in the database rather
|
||||||
|
# than making physical copies.
|
||||||
|
"symlink": True
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sets the weighting for each analysis. A higher weighting gives an analysis
|
||||||
|
# higher precedence when finding the best matches.
|
||||||
|
matcher_weightings = {
|
||||||
|
"f0" : 0.5,
|
||||||
|
"spccntr" : 1.,
|
||||||
|
"spcsprd" : 1.,
|
||||||
|
"spcflux" : 3.,
|
||||||
|
"spccf" : 3.,
|
||||||
|
"spcflatness": 3.,
|
||||||
|
"zerox" : 1.,
|
||||||
|
"rms" : 0.1,
|
||||||
|
"peak": 0.1,
|
||||||
|
"centroid": 0.5,
|
||||||
|
"kurtosis": 2.,
|
||||||
|
"skewness": 2.,
|
||||||
|
"variance": 0.,
|
||||||
|
"harm_ratio": 2
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the method for averaging analysis frames to create a single value
|
||||||
|
# for comparing to other grains. Possible formatters are: 'mean', 'median',
|
||||||
|
# 'log2_mean', 'log2_median'
|
||||||
analysis_dict = {
|
analysis_dict = {
|
||||||
|
# log2_median formats using mel scale. This is useful for analyses such as
|
||||||
|
# F0.
|
||||||
"f0": "log2_median",
|
"f0": "log2_median",
|
||||||
"rms": "mean"
|
"rms": "mean",
|
||||||
|
"zerox": "mean",
|
||||||
|
"spccntr": "median",
|
||||||
|
"spcsprd": "median",
|
||||||
|
"spcflux": "median",
|
||||||
|
"spccf": "median",
|
||||||
|
"spcflatness": "median",
|
||||||
|
"peak": "mean",
|
||||||
|
"centroid": "mean",
|
||||||
|
"kurtosis": "mean",
|
||||||
|
"skewness": "mean",
|
||||||
|
"variance": "mean",
|
||||||
|
"harm_ratio": "mean"
|
||||||
}
|
}
|
||||||
|
|
||||||
analysis = {
|
analysis = {
|
||||||
|
# Force the deletion of any pre-existing analyses to create new ones. This
|
||||||
|
# is needed for overwriting old analyses generated with different
|
||||||
|
# parameters to the current ones.
|
||||||
"reanalyse": False
|
"reanalyse": False
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matcher = {
|
||||||
|
# Force the re-matching of analyses
|
||||||
|
"rematch": False,
|
||||||
|
# This value must be the same as the synthesis grain size to avoid the
|
||||||
|
# speeding up or slowing down of the resulting file in relation to the
|
||||||
|
# original.
|
||||||
|
"grain_size": 100,
|
||||||
|
"overlap": 8,
|
||||||
|
# Defines the number of matches to keep for synthesis. Note that this must
|
||||||
|
# also be specified in the synthesis config
|
||||||
|
"match_quantity": 5,
|
||||||
|
# Choose the algorithm used to perform matching. kdtree is recommended for
|
||||||
|
# larger datasets.
|
||||||
|
"method": 'kdtree'
|
||||||
|
}
|
||||||
|
|
||||||
|
synthesizer = {
|
||||||
|
# Artificially scale the output grain by the difference in RMS values
|
||||||
|
# between source and target.
|
||||||
|
"enforce_intensity": True,
|
||||||
|
# Specify the ratio limit that the grain can be scaled by.
|
||||||
|
"enf_intensity_ratio_limit": 1000.,
|
||||||
|
# Artificially modify the pitch by the difference in f0 values between
|
||||||
|
# source and target.
|
||||||
|
"enforce_f0": True,
|
||||||
|
# Specify the ratio limit that the grain can be modified by.
|
||||||
|
"enf_f0_ratio_limit": 10.,
|
||||||
|
"grain_size": 100,
|
||||||
|
"overlap": 8,
|
||||||
|
# Normalize output, avoid clipping of final output by scaling the final
|
||||||
|
# frames.
|
||||||
|
"normalize" : True,
|
||||||
|
# Defines the number of potential grains to choose from matches when
|
||||||
|
# synthesizing output.
|
||||||
|
"match_quantity": 5
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the format for the output file. Changing this has not been tested
|
||||||
|
# so may produce errors/undesirable results.
|
||||||
output_file = {
|
output_file = {
|
||||||
"samplerate": 44100,
|
"samplerate": 44100,
|
||||||
"format": 131075,
|
"format": 131075,
|
||||||
"channels": 1
|
"channels": 1
|
||||||
}
|
}
|
||||||
|
|
||||||
database = {
|
|
||||||
"symlink": True
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ used to determine the central point of a signal's amplitude and is calculated
|
|||||||
as:
|
as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)}
|
C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} x(i)}.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -34,10 +34,10 @@ defined as:
|
|||||||
.. math::
|
.. math::
|
||||||
R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m)
|
R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m)
|
||||||
|
|
||||||
Then normalizing:
|
then normalizing:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
\Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}
|
\Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}.
|
||||||
|
|
||||||
The fundamental period of the signal is then calculated as the point between
|
The fundamental period of the signal is then calculated as the point between
|
||||||
:math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the
|
:math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the
|
||||||
@@ -45,7 +45,7 @@ original. :math:`T_{min}` and :math:`T_{max}` are defined as the minimum and max
|
|||||||
the fundamental period.
|
the fundamental period.
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\}
|
y = \arg\max_{T_{min} \leq m \leq T_{max}} \{\Gamma_n(m)\}.
|
||||||
|
|
||||||
In order to improve the accuracy of peak detection, parabolic interpolation is
|
In order to improve the accuracy of peak detection, parabolic interpolation is
|
||||||
used to estimate the peak's location with greater accuracy by using the peak
|
used to estimate the peak's location with greater accuracy by using the peak
|
||||||
@@ -66,7 +66,7 @@ Ref: :cite:`smith2011sasp`
|
|||||||
From the fundamental period, the frequency is then calculated as:
|
From the fundamental period, the frequency is then calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
f_0^n = \frac{1}{T_0^n}
|
f_0^n = \frac{1}{T_0^n}.
|
||||||
|
|
||||||
Ref: :cite:`itaa2014`
|
Ref: :cite:`itaa2014`
|
||||||
|
|
||||||
@@ -83,7 +83,7 @@ the signal. The calculation of the STFT is defined as:
|
|||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i -
|
X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i -
|
||||||
i_s(n))\frac{2\pi}{K}\Big)}
|
i_s(n))\frac{2\pi}{K}\Big)}.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -96,7 +96,7 @@ of confidence measure in determining the validity of F0 values. It is
|
|||||||
calculated as part of the F0 estimation algorithm as:
|
calculated as part of the F0 estimation algorithm as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}
|
HR(n) = \max_{T_{min} \leq m \leq T_{max}}{\{\Gamma_n(m)\}}.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -107,7 +107,7 @@ values indicate a flatter distribution and positive values indicate a more
|
|||||||
"peaky" distribution. Kurtosis is calculated as:
|
"peaky" distribution. Kurtosis is calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3
|
TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -117,16 +117,17 @@ Peak amplitude measures the highest peak in the absolute signal. It is
|
|||||||
calculated as:
|
calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}
|
P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}.
|
||||||
|
|
||||||
RMS
|
RMS
|
||||||
~~~
|
~~~
|
||||||
The perceived loudness of a signal is an important feature as it can be related
|
The perceived loudness of a signal is an important feature as it can be related
|
||||||
to the dynamics of the signal. RMS is used as a measure of sound intensity and
|
to the dynamics of the signal. RMS is used as a measure of sound intensity and
|
||||||
is used for distinguishing between loud and quiet audio. It is calculated as:
|
is used for distinguishing between loud and quiet audio. It is calculated as,
|
||||||
|
where :math:`K` is the total number of samples:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}
|
RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}.
|
||||||
|
|
||||||
Other methods that take the human perception of loudness into account may
|
Other methods that take the human perception of loudness into account may
|
||||||
provide more perceptually relevant results. However the RMS measurement
|
provide more perceptually relevant results. However the RMS measurement
|
||||||
@@ -142,7 +143,7 @@ values indicate that the spectral content is centred in higher frequencies and
|
|||||||
lower values indicate a lower centre. The spectral centroid is calculated as:
|
lower values indicate a lower centre. The spectral centroid is calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}
|
SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}.
|
||||||
|
|
||||||
The result is the sum of magnitudes, weighted by their index, normalized by the
|
The result is the sum of magnitudes, weighted by their index, normalized by the
|
||||||
unweighted sum.
|
unweighted sum.
|
||||||
@@ -158,7 +159,7 @@ This differentiates between flat spectrums and sinusoidal spectrums. (low values
|
|||||||
representing the former and high values representing the latter.)
|
representing the former and high values representing the latter.)
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }
|
SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -171,7 +172,7 @@ values that represent a more tonal signal. Spectral flatness is calculated as:
|
|||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot
|
TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot
|
||||||
\sum_{k=0}^{K/2-1} | X(k,n) | }
|
\sum_{k=0}^{K/2-1} | X(k,n) | }.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -184,7 +185,7 @@ similar frames (that suggests a steady state signal). It is calculated as:
|
|||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2
|
SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2
|
||||||
}}{K/2}
|
}}{K/2}.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -196,7 +197,7 @@ and is associated with perceptions of timbre. It is calculated as:
|
|||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n)
|
SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n)
|
||||||
| ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}
|
| ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -206,7 +207,7 @@ The variance of a signal measures it's spread around the signal's arithmetic
|
|||||||
mean. It is used in the calculation of Kurtosis and is calculated as:
|
mean. It is used in the calculation of Kurtosis and is calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
\sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2
|
\sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2.
|
||||||
|
|
||||||
Ref: :cite:`lerch2012itaca`
|
Ref: :cite:`lerch2012itaca`
|
||||||
|
|
||||||
@@ -218,7 +219,7 @@ a signal, as noisy signals will pass from positive to negative more frequently
|
|||||||
than periodic signals. It is calculated as:
|
than periodic signals. It is calculated as:
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] |
|
Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] |
|
||||||
|
|
||||||
\text{Where the sgn function is defined as:}
|
\text{Where the sgn function is defined as:}
|
||||||
|
|
||||||
|
|||||||
@@ -1,36 +1,139 @@
|
|||||||
|
# Specify analysis parameters for root mean square analysis.
|
||||||
rms = {
|
rms = {
|
||||||
|
# Analysis window sizes can be changed for each analysis individually.
|
||||||
|
# These do not need to match the grain size of the matcher or synthesis.
|
||||||
"window_size": 100,
|
"window_size": 100,
|
||||||
"overlap": 2,
|
"overlap": 8,
|
||||||
}
|
}
|
||||||
|
|
||||||
analysis_dict = {
|
f0 = {
|
||||||
"f0": "log2_median",
|
"window_size": 4096,
|
||||||
"rms": "mean"
|
"overlap": 8,
|
||||||
|
# Currently all frames below this ratio are discarded and left as silence.
|
||||||
|
# Different databases will require different values for the best results.
|
||||||
|
# Noisier databases will need lower values than more tonal databases.
|
||||||
|
"ratio_threshold": 0.45
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for variance analysis.
|
||||||
|
variance = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal kurtosis analysis.
|
||||||
|
kurtosis = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal skewness analysis.
|
||||||
|
skewness = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for FFT analysis.
|
||||||
|
fft = {
|
||||||
|
# The FFT window size determines the window size for all spectral analyses.
|
||||||
|
"window_size": 4096
|
||||||
|
}
|
||||||
|
|
||||||
|
database = {
|
||||||
|
# Enables creation of symbolic links to files not in the database rather
|
||||||
|
# than making physical copies.
|
||||||
|
"symlink": True
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sets the weighting for each analysis. A higher weighting gives an analysis
|
||||||
|
# higher precedence when finding the best matches.
|
||||||
matcher_weightings = {
|
matcher_weightings = {
|
||||||
"f0" : 1.,
|
"f0" : 0.5,
|
||||||
"rms": 1.
|
"spccntr" : 1.,
|
||||||
|
"spcsprd" : 1.,
|
||||||
|
"spcflux" : 3.,
|
||||||
|
"spccf" : 3.,
|
||||||
|
"spcflatness": 3.,
|
||||||
|
"zerox" : 1.,
|
||||||
|
"rms" : 0.1,
|
||||||
|
"peak": 0.1,
|
||||||
|
"centroid": 0.5,
|
||||||
|
"kurtosis": 2.,
|
||||||
|
"skewness": 2.,
|
||||||
|
"variance": 0.,
|
||||||
|
"harm_ratio": 2
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the method for averaging analysis frames to create a single value
|
||||||
|
# for comparing to other grains. Possible formatters are: 'mean', 'median',
|
||||||
|
# 'log2_mean', 'log2_median'
|
||||||
|
analysis_dict = {
|
||||||
|
# log2_median formats using mel scale. This is useful for analyses such as
|
||||||
|
# F0.
|
||||||
|
"f0": "log2_median",
|
||||||
|
"rms": "mean",
|
||||||
|
"zerox": "mean",
|
||||||
|
"spccntr": "median",
|
||||||
|
"spcsprd": "median",
|
||||||
|
"spcflux": "median",
|
||||||
|
"spccf": "median",
|
||||||
|
"spcflatness": "median",
|
||||||
|
"peak": "mean",
|
||||||
|
"centroid": "mean",
|
||||||
|
"kurtosis": "mean",
|
||||||
|
"skewness": "mean",
|
||||||
|
"variance": "mean",
|
||||||
|
"harm_ratio": "mean"
|
||||||
}
|
}
|
||||||
|
|
||||||
analysis = {
|
analysis = {
|
||||||
|
# Force the deletion of any pre-existing analyses to create new ones. This
|
||||||
|
# is needed for overwriting old analyses generated with different
|
||||||
|
# parameters to the current ones.
|
||||||
"reanalyse": False
|
"reanalyse": False
|
||||||
}
|
}
|
||||||
|
|
||||||
matcher = {
|
matcher = {
|
||||||
|
# Force the re-matching of analyses
|
||||||
"rematch": False,
|
"rematch": False,
|
||||||
|
# This value must be the same as the synthesis grain size to avoid the
|
||||||
|
# speeding up or slowing down of the resulting file in relation to the
|
||||||
|
# original.
|
||||||
"grain_size": 100,
|
"grain_size": 100,
|
||||||
"overlap": 2,
|
"overlap": 8,
|
||||||
# Defines the number of matches to keep for synthesis.
|
# Defines the number of matches to keep for synthesis. Note that this must
|
||||||
"match_quantity": 20
|
# also be specified in the synthesis config
|
||||||
|
"match_quantity": 5,
|
||||||
|
# Choose the algorithm used to perform matching. kdtree is recommended for
|
||||||
|
# larger datasets.
|
||||||
|
"method": 'kdtree'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
synthesizer = {
|
||||||
|
# Artificially scale the output grain by the difference in RMS values
|
||||||
|
# between source and target.
|
||||||
|
"enforce_intensity": True,
|
||||||
|
# Specify the ratio limit that the grain can be scaled by.
|
||||||
|
"enf_intensity_ratio_limit": 1000.,
|
||||||
|
# Artificially modify the pitch by the difference in f0 values between
|
||||||
|
# source and target.
|
||||||
|
"enforce_f0": True,
|
||||||
|
# Specify the ratio limit that the grain can be modified by.
|
||||||
|
"enf_f0_ratio_limit": 10.,
|
||||||
|
"grain_size": 100,
|
||||||
|
"overlap": 8,
|
||||||
|
# Normalize output, avoid clipping of final output by scaling the final
|
||||||
|
# frames.
|
||||||
|
"normalize" : True,
|
||||||
|
# Defines the number of potential grains to choose from matches when
|
||||||
|
# synthesizing output.
|
||||||
|
"match_quantity": 5
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the format for the output file. Changing this has not been tested
|
||||||
|
# so may produce errors/undesirable results.
|
||||||
output_file = {
|
output_file = {
|
||||||
"samplerate": 44100,
|
"samplerate": 44100,
|
||||||
"format": 131075,
|
"format": 131075,
|
||||||
"channels": 1
|
"channels": 1
|
||||||
}
|
}
|
||||||
|
|
||||||
database = {
|
|
||||||
"symlink": True
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,36 +1,139 @@
|
|||||||
|
# Specify analysis parameters for root mean square analysis.
|
||||||
rms = {
|
rms = {
|
||||||
|
# Analysis window sizes can be changed for each analysis individually.
|
||||||
|
# These do not need to match the grain size of the matcher or synthesis.
|
||||||
"window_size": 100,
|
"window_size": 100,
|
||||||
"overlap": 2,
|
"overlap": 8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
f0 = {
|
||||||
|
"window_size": 4096,
|
||||||
|
"overlap": 8,
|
||||||
|
# Currently all frames below this ratio are discarded and left as silence.
|
||||||
|
# Different databases will require different values for the best results.
|
||||||
|
# Noisier databases will need lower values than more tonal databases.
|
||||||
|
"ratio_threshold": 0.45
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for variance analysis.
|
||||||
|
variance = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal kurtosis analysis.
|
||||||
|
kurtosis = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for temporal skewness analysis.
|
||||||
|
skewness = {
|
||||||
|
"window_size": 100,
|
||||||
|
"overlap": 8
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specify analysis parameters for FFT analysis.
|
||||||
|
fft = {
|
||||||
|
# The FFT window size determines the window size for all spectral analyses.
|
||||||
|
"window_size": 4096
|
||||||
|
}
|
||||||
|
|
||||||
|
database = {
|
||||||
|
# Enables creation of symbolic links to files not in the database rather
|
||||||
|
# than making physical copies.
|
||||||
|
"symlink": True
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sets the weighting for each analysis. A higher weighting gives an analysis
|
||||||
|
# higher precedence when finding the best matches.
|
||||||
|
matcher_weightings = {
|
||||||
|
"f0" : 0.5,
|
||||||
|
"spccntr" : 1.,
|
||||||
|
"spcsprd" : 1.,
|
||||||
|
"spcflux" : 3.,
|
||||||
|
"spccf" : 3.,
|
||||||
|
"spcflatness": 3.,
|
||||||
|
"zerox" : 1.,
|
||||||
|
"rms" : 0.1,
|
||||||
|
"peak": 0.1,
|
||||||
|
"centroid": 0.5,
|
||||||
|
"kurtosis": 2.,
|
||||||
|
"skewness": 2.,
|
||||||
|
"variance": 0.,
|
||||||
|
"harm_ratio": 2
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the method for averaging analysis frames to create a single value
|
||||||
|
# for comparing to other grains. Possible formatters are: 'mean', 'median',
|
||||||
|
# 'log2_mean', 'log2_median'
|
||||||
analysis_dict = {
|
analysis_dict = {
|
||||||
|
# log2_median formats using mel scale. This is useful for analyses such as
|
||||||
|
# F0.
|
||||||
"f0": "log2_median",
|
"f0": "log2_median",
|
||||||
"rms": "mean"
|
"rms": "mean",
|
||||||
|
"zerox": "mean",
|
||||||
|
"spccntr": "median",
|
||||||
|
"spcsprd": "median",
|
||||||
|
"spcflux": "median",
|
||||||
|
"spccf": "median",
|
||||||
|
"spcflatness": "median",
|
||||||
|
"peak": "mean",
|
||||||
|
"centroid": "mean",
|
||||||
|
"kurtosis": "mean",
|
||||||
|
"skewness": "mean",
|
||||||
|
"variance": "mean",
|
||||||
|
"harm_ratio": "mean"
|
||||||
}
|
}
|
||||||
|
|
||||||
analysis = {
|
analysis = {
|
||||||
|
# Force the deletion of any pre-existing analyses to create new ones. This
|
||||||
|
# is needed for overwriting old analyses generated with different
|
||||||
|
# parameters to the current ones.
|
||||||
"reanalyse": False
|
"reanalyse": False
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matcher = {
|
||||||
|
# Force the re-matching of analyses
|
||||||
|
"rematch": False,
|
||||||
|
# This value must be the same as the synthesis grain size to avoid the
|
||||||
|
# speeding up or slowing down of the resulting file in relation to the
|
||||||
|
# original.
|
||||||
|
"grain_size": 100,
|
||||||
|
"overlap": 8,
|
||||||
|
# Defines the number of matches to keep for synthesis. Note that this must
|
||||||
|
# also be specified in the synthesis config
|
||||||
|
"match_quantity": 5,
|
||||||
|
# Choose the algorithm used to perform matching. kdtree is recommended for
|
||||||
|
# larger datasets.
|
||||||
|
"method": 'kdtree'
|
||||||
|
}
|
||||||
|
|
||||||
|
synthesizer = {
|
||||||
|
# Artificially scale the output grain by the difference in RMS values
|
||||||
|
# between source and target.
|
||||||
|
"enforce_intensity": True,
|
||||||
|
# Specify the ratio limit that the grain can be scaled by.
|
||||||
|
"enf_intensity_ratio_limit": 1000.,
|
||||||
|
# Artificially modify the pitch by the difference in f0 values between
|
||||||
|
# source and target.
|
||||||
|
"enforce_f0": True,
|
||||||
|
# Specify the ratio limit that the grain can be modified by.
|
||||||
|
"enf_f0_ratio_limit": 10.,
|
||||||
|
"grain_size": 100,
|
||||||
|
"overlap": 8,
|
||||||
|
# Normalize output, avoid clipping of final output by scaling the final
|
||||||
|
# frames.
|
||||||
|
"normalize" : True,
|
||||||
|
# Defines the number of potential grains to choose from matches when
|
||||||
|
# synthesizing output.
|
||||||
|
"match_quantity": 5
|
||||||
|
}
|
||||||
|
|
||||||
|
# Specifies the format for the output file. Changing this has not been tested
|
||||||
|
# so may produce errors/undesirable results.
|
||||||
output_file = {
|
output_file = {
|
||||||
"samplerate": 44100,
|
"samplerate": 44100,
|
||||||
"format": 131075,
|
"format": 131075,
|
||||||
"channels": 1
|
"channels": 1
|
||||||
}
|
}
|
||||||
|
|
||||||
synthesizer = {
|
|
||||||
"enforce_rms": True,
|
|
||||||
"enf_rms_ratio_limit": 5.,
|
|
||||||
"enforce_f0": True,
|
|
||||||
"enf_f0_ratio_limit": 10.,
|
|
||||||
"grain_size": 100,
|
|
||||||
"overlap": 2,
|
|
||||||
"normalize" : True,
|
|
||||||
# Defines the number of potential grains to choose from matches when
|
|
||||||
# synthesizing output.
|
|
||||||
"match_quantity": 20
|
|
||||||
}
|
|
||||||
|
|
||||||
database = {
|
|
||||||
"symlink": True
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ For this demonstration, the following file structure will be used:
|
|||||||
|-- target.03.wav
|
|-- target.03.wav
|
||||||
`-- target.04.wav
|
`-- target.04.wav
|
||||||
|
|
||||||
A source database containing a small selection of trumpet samples (aquired from
|
A source database containing a small selection of trumpet samples (acquired from
|
||||||
|
|
||||||
http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4
|
http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4
|
||||||
target sounds. This will produce 4 output files, one for each target sound.
|
target sounds. This will produce 4 output files, one for each target sound.
|
||||||
@@ -417,7 +417,9 @@ concatenate.py Script Flags
|
|||||||
|
|
||||||
--match_method Choose the algorithm to use when matching analyses. Available algorithms are:
|
--match_method Choose the algorithm to use when matching analyses. Available algorithms are:
|
||||||
|
|
||||||
Brute force: 'bruteforce'
|
Brute force: 'bruteforce' (BROKEN. The brute force
|
||||||
|
matcher no longer works with the current release of
|
||||||
|
this script. Use the K-d Tree Search.)
|
||||||
|
|
||||||
K-d Tree Search: 'kdtree'
|
K-d Tree Search: 'kdtree'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user