Added sklearn to install script

2016-04-14 14:25:10 +01:00
parent d9d64e5b02
commit e35ddae1f5
6 changed files with 384 additions and 58 deletions
@@ -5,3 +5,4 @@ pip install pysndfile
 pip install h5py
 pip install https://github.com/Pezz89/fileops/zipball/master
 pip install -e ./
+pip install sklearn
@@ -1,23 +1,139 @@
+# Specify analysis parameters for root mean square analysis.
 rms = {
+    # Analysis window sizes can be changed for each analysis individually.
+    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }

+f0 = {
+    "window_size": 4096,
+    "overlap": 8,
+    # Currently all frames below this ratio are discarded and left as silence.
+    # Different databases will require different values for the best results.
+    # Noisier databases will need lower values than more tonal databases.
+    "ratio_threshold": 0.45
+}
+
+# Specify analysis parameters for variance analysis.
+variance = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal kurtosis analysis.
+kurtosis = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal skewness analysis.
+skewness = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for FFT analysis.
+fft = {
+    # The FFT window size determines the window size for all spectral analyses.
+    "window_size": 4096
+}
+
+database = {
+    # Enables creation of symbolic links to files not in the database rather
+    # than making physical copies.
+    "symlink": True
+}
+
+# Sets the weighting for each analysis. A higher weighting gives an analysis
+# higher precedence when finding the best matches.
+matcher_weightings = {
+    "f0" : 0.5,
+    "spccntr" : 1.,
+    "spcsprd" : 1.,
+    "spcflux" : 3.,
+    "spccf" : 3.,
+    "spcflatness": 3.,
+    "zerox" : 1.,
+    "rms" : 0.1,
+    "peak": 0.1,
+    "centroid": 0.5,
+    "kurtosis": 2.,
+    "skewness": 2.,
+    "variance": 0.,
+    "harm_ratio": 2
+}
+
+# Specifies the method for averaging analysis frames to create a single value
+# for comparing to other grains. Possible formatters are: 'mean', 'median',
+# 'log2_mean', 'log2_median'
 analysis_dict = {
+    # log2_median formats using mel scale. This is useful for analyses such as
+    # F0.
    "f0": "log2_median",
-    "rms": "mean"
+    "rms": "mean",
+    "zerox": "mean",
+    "spccntr": "median",
+    "spcsprd": "median",
+    "spcflux": "median",
+    "spccf": "median",
+    "spcflatness": "median",
+    "peak": "mean",
+    "centroid": "mean",
+    "kurtosis": "mean",
+    "skewness": "mean",
+    "variance": "mean",
+    "harm_ratio": "mean"
 }

 analysis = {
+    # Force the deletion of any pre-existing analyses to create new ones. This
+    # is needed for overwriting old analyses generated with different
+    # parameters to the current ones.
    "reanalyse": False
 }

+matcher = {
+    # Force the re-matching of analyses
+    "rematch": False,
+    # This value must be the same as the synthesis grain size to avoid the
+    # speeding up or slowing down of the resulting file in relation to the
+    # original.
+    "grain_size": 100,
+    "overlap": 8,
+    # Defines the number of matches to keep for synthesis. Note that this must
+    # also be specified in the synthesis config
+    "match_quantity": 5,
+    # Choose the algorithm used to perform matching. kdtree is recommended for
+    # larger datasets.
+    "method": 'kdtree'
+}
+
+synthesizer = {
+    # Artificially scale the output grain by the difference in RMS values
+    # between source and target.
+    "enforce_intensity": True,
+    # Specify the ratio limit that the grain can be scaled by.
+    "enf_intensity_ratio_limit": 1000.,
+    # Artificially modify the pitch by the difference in f0 values between
+    # source and target.
+    "enforce_f0": True,
+    # Specify the ratio limit that the grain can be modified by.
+    "enf_f0_ratio_limit": 10.,
+    "grain_size": 100,
+    "overlap": 8,
+    # Normalize output, avoid clipping of final output by scaling the final
+    # frames.
+    "normalize" : True,
+    # Defines the number of potential grains to choose from matches when
+    # synthesizing output.
+    "match_quantity": 5
+}
+
+# Specifies the format for the output file. Changing this has not been tested
+# so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
-
-database = {
-    "symlink": True
-}
@@ -15,7 +15,7 @@ used to determine the central point of a signal's amplitude and is calculated
 as:

 .. math::
-    C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)}
+    C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} x(i)}.

 Ref: :cite:`lerch2012itaca`

@@ -34,10 +34,10 @@ defined as:
 .. math::
    R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m)

-Then normalizing:
+then normalizing:

 .. math::
-    \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}
+    \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}.

 The fundamental period of the signal is then calculated as the point between
 :math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the
@@ -45,7 +45,7 @@ original. :math:`T_{min}` and :math:`T_{max}` are defined as the minimum and max
 the fundamental period.

 .. math::
-    y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\}
+    y = \arg\max_{T_{min} \leq m \leq T_{max}} \{\Gamma_n(m)\}.

 In order to improve the accuracy of peak detection, parabolic interpolation is
 used to estimate the peak's location with greater accuracy by using the peak
@@ -66,7 +66,7 @@ Ref: :cite:`smith2011sasp`
 From the fundamental period, the frequency is then calculated as:

 .. math::
-    f_0^n = \frac{1}{T_0^n}
+    f_0^n = \frac{1}{T_0^n}.

 Ref: :cite:`itaa2014`

@@ -83,7 +83,7 @@ the signal. The calculation of the STFT is defined as:

 .. math::
    X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i -
-    i_s(n))\frac{2\pi}{K}\Big)}
+    i_s(n))\frac{2\pi}{K}\Big)}.

 Ref: :cite:`lerch2012itaca`

@@ -96,7 +96,7 @@ of confidence measure in determining the validity of F0 values. It is
 calculated as part of the F0 estimation algorithm as:

 .. math::
-    HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}
+    HR(n) = \max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}.

 Ref: :cite:`lerch2012itaca`

@@ -107,7 +107,7 @@ values indicate a flatter distribution and positive values indicate a more
 "peaky" distribution. Kurtosis is calculated as:

 .. math::
-    TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3
+    TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3.

 Ref: :cite:`lerch2012itaca`

@@ -117,16 +117,17 @@ Peak amplitude measures the highest peak in the absolute signal. It is
 calculated as:

 .. math::
-    P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}
+    P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}.

 RMS
 ~~~
 The perceived loudness of a signal is an important feature as it can be related
 to the dynamics of the signal.  RMS is used as a measure of sound intensity and
-is used for distinguishing between loud and quiet audio. It is calculated as:
+is used for distinguishing between loud and quiet audio. It is calculated as
+follows, where :math:`K` is the total number of samples:

 .. math::
-    RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}
+    RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}.

 Other methods that take the human perception of loudness into account may
 provide more perceptually relevant results. However the RMS measurement
@@ -142,7 +143,7 @@ values indicate that the spectral content is centred in higher frequencies and
 lower values indicate a lower centre. The spectral centroid is calculated as:

 .. math::
-    SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}
+    SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}.

 The result is the sum of magnitudes, weighted by their index, normalized by the
 unweighted sum.
@@ -158,7 +159,7 @@ This differentiates between flat spectrums and sinusoidal spectrums. (low values
 representing the former and high values representing the latter.)

 .. math::
-    SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }
+    SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }.

 Ref: :cite:`lerch2012itaca`

@@ -171,7 +172,7 @@ values that represent a more tonal signal. Spectral flatness is calculated as:

 .. math::
    TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot
-    \sum_{k=0}^{K/2-1} | X(k,n) | }
+    \sum_{k=0}^{K/2-1} | X(k,n) | }.

 Ref: :cite:`lerch2012itaca`

@@ -184,7 +185,7 @@ similar frames (that suggests a steady state signal). It is calculated as:

 .. math::
    SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2
-    }}{K/2}
+    }}{K/2}.

 Ref: :cite:`lerch2012itaca`

@@ -196,7 +197,7 @@ and is associated with perceptions of timbre. It is calculated as:

 .. math::
    SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n)
-    | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}
+    | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}.

 Ref: :cite:`lerch2012itaca`

@@ -206,7 +207,7 @@ The variance of a signal measures it's spread around the signal's arithmetic
 mean. It is used in the calculation of Kurtosis and is calculated as:

 .. math::
-    \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2    
+    \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2.

 Ref: :cite:`lerch2012itaca`

@@ -1,36 +1,139 @@
+# Specify analysis parameters for root mean square analysis.
 rms = {
+    # Analysis window sizes can be changed for each analysis individually.
+    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }

-analysis_dict = {
-    "f0": "log2_median",
-    "rms": "mean"
+f0 = {
+    "window_size": 4096,
+    "overlap": 8,
+    # Currently all frames below this ratio are discarded and left as silence.
+    # Different databases will require different values for the best results.
+    # Noisier databases will need lower values than more tonal databases.
+    "ratio_threshold": 0.45
 }

+# Specify analysis parameters for variance analysis.
+variance = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal kurtosis analysis.
+kurtosis = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal skewness analysis.
+skewness = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for FFT analysis.
+fft = {
+    # The FFT window size determines the window size for all spectral analyses.
+    "window_size": 4096
+}
+
+database = {
+    # Enables creation of symbolic links to files not in the database rather
+    # than making physical copies.
+    "symlink": True
+}
+
+# Sets the weighting for each analysis. A higher weighting gives an analysis
+# higher precedence when finding the best matches.
 matcher_weightings = {
-    "f0" : 1.,
-    "rms": 1.
+    "f0" : 0.5,
+    "spccntr" : 1.,
+    "spcsprd" : 1.,
+    "spcflux" : 3.,
+    "spccf" : 3.,
+    "spcflatness": 3.,
+    "zerox" : 1.,
+    "rms" : 0.1,
+    "peak": 0.1,
+    "centroid": 0.5,
+    "kurtosis": 2.,
+    "skewness": 2.,
+    "variance": 0.,
+    "harm_ratio": 2
+}
+
+# Specifies the method for averaging analysis frames to create a single value
+# for comparing to other grains. Possible formatters are: 'mean', 'median',
+# 'log2_mean', 'log2_median'
+analysis_dict = {
+    # log2_median formats using mel scale. This is useful for analyses such as
+    # F0.
+    "f0": "log2_median",
+    "rms": "mean",
+    "zerox": "mean",
+    "spccntr": "median",
+    "spcsprd": "median",
+    "spcflux": "median",
+    "spccf": "median",
+    "spcflatness": "median",
+    "peak": "mean",
+    "centroid": "mean",
+    "kurtosis": "mean",
+    "skewness": "mean",
+    "variance": "mean",
+    "harm_ratio": "mean"
 }

 analysis = {
+    # Force the deletion of any pre-existing analyses to create new ones. This
+    # is needed for overwriting old analyses generated with different
+    # parameters to the current ones.
    "reanalyse": False
 }

 matcher = {
+    # Force the re-matching of analyses
    "rematch": False,
+    # This value must be the same as the synthesis grain size to avoid the
+    # speeding up or slowing down of the resulting file in relation to the
+    # original.
    "grain_size": 100,
-    "overlap": 2,
-    # Defines the number of matches to keep for synthesis.
-    "match_quantity": 20
+    "overlap": 8,
+    # Defines the number of matches to keep for synthesis. Note that this must
+    # also be specified in the synthesis config
+    "match_quantity": 5,
+    # Choose the algorithm used to perform matching. kdtree is recommended for
+    # larger datasets.
+    "method": 'kdtree'
 }

+synthesizer = {
+    # Artificially scale the output grain by the difference in RMS values
+    # between source and target.
+    "enforce_intensity": True,
+    # Specify the ratio limit that the grain can be scaled by.
+    "enf_intensity_ratio_limit": 1000.,
+    # Artificially modify the pitch by the difference in f0 values between
+    # source and target.
+    "enforce_f0": True,
+    # Specify the ratio limit that the grain can be modified by.
+    "enf_f0_ratio_limit": 10.,
+    "grain_size": 100,
+    "overlap": 8,
+    # Normalize output, avoid clipping of final output by scaling the final
+    # frames.
+    "normalize" : True,
+    # Defines the number of potential grains to choose from matches when
+    # synthesizing output.
+    "match_quantity": 5
+}
+
+# Specifies the format for the output file. Changing this has not been tested
+# so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
-
-database = {
-    "symlink": True
-}
@@ -1,36 +1,139 @@
+# Specify analysis parameters for root mean square analysis.
 rms = {
+    # Analysis window sizes can be changed for each analysis individually.
+    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }

+f0 = {
+    "window_size": 4096,
+    "overlap": 8,
+    # Currently all frames below this ratio are discarded and left as silence.
+    # Different databases will require different values for the best results.
+    # Noisier databases will need lower values than more tonal databases.
+    "ratio_threshold": 0.45
+}
+
+# Specify analysis parameters for variance analysis.
+variance = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal kurtosis analysis.
+kurtosis = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for temporal skewness analysis.
+skewness = {
+    "window_size": 100,
+    "overlap": 8
+}
+
+# Specify analysis parameters for FFT analysis.
+fft = {
+    # The FFT window size determines the window size for all spectral analyses.
+    "window_size": 4096
+}
+
+database = {
+    # Enables creation of symbolic links to files not in the database rather
+    # than making physical copies.
+    "symlink": True
+}
+
+# Sets the weighting for each analysis. A higher weighting gives an analysis
+# higher precedence when finding the best matches.
+matcher_weightings = {
+    "f0" : 0.5,
+    "spccntr" : 1.,
+    "spcsprd" : 1.,
+    "spcflux" : 3.,
+    "spccf" : 3.,
+    "spcflatness": 3.,
+    "zerox" : 1.,
+    "rms" : 0.1,
+    "peak": 0.1,
+    "centroid": 0.5,
+    "kurtosis": 2.,
+    "skewness": 2.,
+    "variance": 0.,
+    "harm_ratio": 2
+}
+
+# Specifies the method for averaging analysis frames to create a single value
+# for comparing to other grains. Possible formatters are: 'mean', 'median',
+# 'log2_mean', 'log2_median'
 analysis_dict = {
+    # log2_median formats using mel scale. This is useful for analyses such as
+    # F0.
    "f0": "log2_median",
-    "rms": "mean"
+    "rms": "mean",
+    "zerox": "mean",
+    "spccntr": "median",
+    "spcsprd": "median",
+    "spcflux": "median",
+    "spccf": "median",
+    "spcflatness": "median",
+    "peak": "mean",
+    "centroid": "mean",
+    "kurtosis": "mean",
+    "skewness": "mean",
+    "variance": "mean",
+    "harm_ratio": "mean"
 }

 analysis = {
+    # Force the deletion of any pre-existing analyses to create new ones. This
+    # is needed for overwriting old analyses generated with different
+    # parameters to the current ones.
    "reanalyse": False
 }

+matcher = {
+    # Force the re-matching of analyses
+    "rematch": False,
+    # This value must be the same as the synthesis grain size to avoid the
+    # speeding up or slowing down of the resulting file in relation to the
+    # original.
+    "grain_size": 100,
+    "overlap": 8,
+    # Defines the number of matches to keep for synthesis. Note that this must
+    # also be specified in the synthesis config
+    "match_quantity": 5,
+    # Choose the algorithm used to perform matching. kdtree is recommended for
+    # larger datasets.
+    "method": 'kdtree'
+}
+
+synthesizer = {
+    # Artificially scale the output grain by the difference in RMS values
+    # between source and target.
+    "enforce_intensity": True,
+    # Specify the ratio limit that the grain can be scaled by.
+    "enf_intensity_ratio_limit": 1000.,
+    # Artificially modify the pitch by the difference in f0 values between
+    # source and target.
+    "enforce_f0": True,
+    # Specify the ratio limit that the grain can be modified by.
+    "enf_f0_ratio_limit": 10.,
+    "grain_size": 100,
+    "overlap": 8,
+    # Normalize output, avoid clipping of final output by scaling the final
+    # frames.
+    "normalize" : True,
+    # Defines the number of potential grains to choose from matches when
+    # synthesizing output.
+    "match_quantity": 5
+}
+
+# Specifies the format for the output file. Changing this has not been tested
+# so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
-
-synthesizer = {
-    "enforce_rms": True,
-    "enf_rms_ratio_limit": 5.,
-    "enforce_f0": True,
-    "enf_f0_ratio_limit": 10.,
-    "grain_size": 100,
-    "overlap": 2,
-    "normalize" : True,
-    # Defines the number of potential grains to choose from matches when
-    # synthesizing output.
-    "match_quantity": 20
-}
-
-database = {
-    "symlink": True
-}
@@ -84,7 +84,7 @@ For this demonstration, the following file structure will be used:
        |-- target.03.wav
        `-- target.04.wav

-A source database containing a small selection of trumpet samples (aquired from
+A source database containing a small selection of trumpet samples (acquired from

 http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4
 target sounds. This will produce 4 output files, one for each target sound.
@@ -417,7 +417,9 @@ concatenate.py Script Flags

 --match_method        Choose the algorithm to use when matching analyses. Available algorithms are:

-                         Brute force: 'bruteforce'
+                         Brute force: 'bruteforce' (BROKEN. The brute force
+                         matcher no longer works with the current release of
+                         this script. Use the K-d Tree Search.)

                         K-d Tree Search: 'kdtree'