Added sklearn to install script

2016-04-14 14:25:10 +01:00
parent d9d64e5b02
commit e35ddae1f5
6 changed files with 384 additions and 58 deletions
@@ -5,3 +5,4 @@ pip install pysndfile
 pip install h5py
 pip install https://github.com/Pezz89/fileops/zipball/master
 pip install -e ./
 pip install sklearn
@@ -1,23 +1,139 @@
 # Specify analysis parameters for root mean square analysis.
 rms = {
    # Analysis window sizes can be changed for each analysis individually.
    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }
 f0 = {
    "window_size": 4096,
    "overlap": 8,
    # Currently all frames below this ratio are discarded and left as silence.
    # Different databases will require different values for the best results.
    # Noisier databases will need lower values than more tonal databases.
    "ratio_threshold": 0.45
 }
 # Specify analysis parameters for variance analysis.
 variance = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal kurtosis analysis.
 kurtosis = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal skewness analysis.
 skewness = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for FFT analysis.
 fft = {
    # The FFT window size determines the window size for all spectral analyses.
    "window_size": 4096
 }
 database = {
    # Enables creation of symbolic links to files not in the database rather
    # than making physical copies.
    "symlink": True
 }
 # Sets the weighting for each analysis. A higher weighting gives an analysis
 # higher precedence when finding the best matches.
 matcher_weightings = {
    "f0" : 0.5,
    "spccntr" : 1.,
    "spcsprd" : 1.,
    "spcflux" : 3.,
    "spccf" : 3.,
    "spcflatness": 3.,
    "zerox" : 1.,
    "rms" : 0.1,
    "peak": 0.1,
    "centroid": 0.5,
    "kurtosis": 2.,
    "skewness": 2.,
    "variance": 0.,
    "harm_ratio": 2
 }
 # Specifies the method for averaging analysis frames to create a single value
 # for comparing to other grains. Possible formatters are: 'mean', 'median',
 # 'log2_mean', 'log2_median'
 analysis_dict = {
    # log2_median formats using mel scale. This is useful for analyses such as
    # F0.
    "f0": "log2_median",
-    "rms": "mean"
+    "rms": "mean",
    "zerox": "mean",
    "spccntr": "median",
    "spcsprd": "median",
    "spcflux": "median",
    "spccf": "median",
    "spcflatness": "median",
    "peak": "mean",
    "centroid": "mean",
    "kurtosis": "mean",
    "skewness": "mean",
    "variance": "mean",
    "harm_ratio": "mean"
 }
 analysis = {
    # Force the deletion of any pre-existing analyses to create new ones. This
    # is needed for overwriting old analyses generated with different
    # parameters to the current ones.
    "reanalyse": False
 }
 matcher = {
    # Force the re-matching of analyses
    "rematch": False,
    # This value must be the same as the synthesis grain size to avoid the
    # speeding up or slowing down of the resulting file in relation to the
    # original.
    "grain_size": 100,
    "overlap": 8,
    # Defines the number of matches to keep for synthesis. Note that this must
    # also be specified in the synthesis config
    "match_quantity": 5,
    # Choose the algorithm used to perform matching. kdtree is recommended for
    # larger datasets.
    "method": 'kdtree'
 }
 synthesizer = {
    # Artificially scale the output grain by the difference in RMS values
    # between source and target.
    "enforce_intensity": True,
    # Specify the ratio limit that the grain can be scaled by.
    "enf_intensity_ratio_limit": 1000.,
    # Artificially modify the pitch by the difference in f0 values between
    # source and target.
    "enforce_f0": True,
    # Specify the ratio limit that the grain can be modified by.
    "enf_f0_ratio_limit": 10.,
    "grain_size": 100,
    "overlap": 8,
    # Normalize output, avoid clipping of final output by scaling the final
    # frames.
    "normalize" : True,
    # Defines the number of potential grains to choose from matches when
    # synthesizing output.
    "match_quantity": 5
 }
 # Specifies the format for the output file. Changing this has not been tested
 # so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
 database = {
    "symlink": True
 }
@@ -15,7 +15,7 @@ used to determine the central point of a signal's amplitude and is calculated
 as:
 .. math::
-    C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)}
+    C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} x(i)}.
 Ref: :cite:`lerch2012itaca`
@@ -34,10 +34,10 @@ defined as:
 .. math::
    R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m)
-Then normalizing:
+then normalizing:
 .. math::
-    \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}
+    \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}.
 The fundamental period of the signal is then calculated as the point between
 :math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the
@@ -45,7 +45,7 @@ original. :math:`T_{min}` and :math:`T_{max}` are defined as the minimum and max
 the fundamental period.
 .. math::
-    y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\}
+    y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_n(m)\}.
 In order to improve the accuracy of peak detection, parabolic interpolation is
 used to estimate the peak's location with greater accuracy by using the peak
@@ -66,7 +66,7 @@ Ref: :cite:`smith2011sasp`
 From the fundamental period, the frequency is then calculated as:
 .. math::
-    f_0^n = \frac{1}{T_0^n}
+    f_0^n = \frac{1}{T_0^n}.
 Ref: :cite:`itaa2014`
@@ -83,7 +83,7 @@ the signal. The calculation of the STFT is defined as:
 .. math::
    X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i -
-    i_s(n))\frac{2\pi}{K}\Big)}
+    i_s(n))\frac{2\pi}{K}\Big)}.
 Ref: :cite:`lerch2012itaca`
@@ -96,7 +96,7 @@ of confidence measure in determining the validity of F0 values. It is
 calculated as part of the F0 estimation algorithm as:
 .. math::
-    HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}
+    HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}}.
 Ref: :cite:`lerch2012itaca`
@@ -107,7 +107,7 @@ values indicate a flatter distribution and positive values indicate a more
 "peaky" distribution. Kurtosis is calculated as:
 .. math::
-    TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3
+    TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3.
 Ref: :cite:`lerch2012itaca`
@@ -117,16 +117,17 @@ Peak amplitude measures the highest peak in the absolute signal. It is
 calculated as:
 .. math::
-    P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}
+    P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}.
 RMS
 ~~~
 The perceived loudness of a signal is an important feature as it can be related
 to the dynamics of the signal.  RMS is used as a measure of sound intensity and
-is used for distinguishing between loud and quiet audio. It is calculated as:
+is used for distinguishing between loud and quiet audio. It is calculated as,
 where :math:`K` is the total number of samples:
 .. math::
-    RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}
+    RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}.
 Other methods that take the human perception of loudness into account may
 provide more perceptually relevant results. However the RMS measurement
@@ -142,7 +143,7 @@ values indicate that the spectral content is centred in higher frequencies and
 lower values indicate a lower centre. The spectral centroid is calculated as:
 .. math::
-    SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}
+    SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}.
 The result is the sum of magnitudes, weighted by their index, normalized by the
 unweighted sum.
@@ -158,7 +159,7 @@ This differentiates between flat spectrums and sinusoidal spectrums. (low values
 representing the former and high values representing the latter.)
 .. math::
-    SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }
+    SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }.
 Ref: :cite:`lerch2012itaca`
@@ -171,7 +172,7 @@ values that represent a more tonal signal. Spectral flatness is calculated as:
 .. math::
    TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot
-    \sum_{k=0}^{K/2-1} | X(k,n) | }
+    \sum_{k=0}^{K/2-1} | X(k,n) | }.
 Ref: :cite:`lerch2012itaca`
@@ -184,7 +185,7 @@ similar frames (that suggests a steady state signal). It is calculated as:
 .. math::
    SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2
-    }}{K/2}
+    }}{K/2}.
 Ref: :cite:`lerch2012itaca`
@@ -196,7 +197,7 @@ and is associated with perceptions of timbre. It is calculated as:
 .. math::
    SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n)
-    | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}
+    | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}.
 Ref: :cite:`lerch2012itaca`
@@ -206,7 +207,7 @@ The variance of a signal measures it's spread around the signal's arithmetic
 mean. It is used in the calculation of Kurtosis and is calculated as:
 .. math::
-    \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2    
+    \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2.
 Ref: :cite:`lerch2012itaca`
@@ -218,7 +219,7 @@ a signal, as noisy signals will pass from positive to negative more frequently
 than periodic signals. It is calculated as:
 .. math::
-    Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] |
+    Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] | 
    \text{Where the sgn function is defined as:}
@@ -1,36 +1,139 @@
 # Specify analysis parameters for root mean square analysis.
 rms = {
    # Analysis window sizes can be changed for each analysis individually.
    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }
-analysis_dict = {
+f0 = {
-    "f0": "log2_median",
+    "window_size": 4096,
-    "rms": "mean"
+    "overlap": 8,
    # Currently all frames below this ratio are discarded and left as silence.
    # Different databases will require different values for the best results.
    # Noisier databases will need lower values than more tonal databases.
    "ratio_threshold": 0.45
 }
 # Specify analysis parameters for variance analysis.
 variance = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal kurtosis analysis.
 kurtosis = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal skewness analysis.
 skewness = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for FFT analysis.
 fft = {
    # The FFT window size determines the window size for all spectral analyses.
    "window_size": 4096
 }
 database = {
    # Enables creation of symbolic links to files not in the database rather
    # than making physical copies.
    "symlink": True
 }
 # Sets the weighting for each analysis. A higher weighting gives an analysis
 # higher precedence when finding the best matches.
 matcher_weightings = {
-    "f0" : 1.,
+    "f0" : 0.5,
-    "rms": 1.
+    "spccntr" : 1.,
    "spcsprd" : 1.,
    "spcflux" : 3.,
    "spccf" : 3.,
    "spcflatness": 3.,
    "zerox" : 1.,
    "rms" : 0.1,
    "peak": 0.1,
    "centroid": 0.5,
    "kurtosis": 2.,
    "skewness": 2.,
    "variance": 0.,
    "harm_ratio": 2
 }
 # Specifies the method for averaging analysis frames to create a single value
 # for comparing to other grains. Possible formatters are: 'mean', 'median',
 # 'log2_mean', 'log2_median'
 analysis_dict = {
    # log2_median formats using mel scale. This is useful for analyses such as
    # F0.
    "f0": "log2_median",
    "rms": "mean",
    "zerox": "mean",
    "spccntr": "median",
    "spcsprd": "median",
    "spcflux": "median",
    "spccf": "median",
    "spcflatness": "median",
    "peak": "mean",
    "centroid": "mean",
    "kurtosis": "mean",
    "skewness": "mean",
    "variance": "mean",
    "harm_ratio": "mean"
 }
 analysis = {
    # Force the deletion of any pre-existing analyses to create new ones. This
    # is needed for overwriting old analyses generated with different
    # parameters to the current ones.
    "reanalyse": False
 }
 matcher = {
    # Force the re-matching of analyses
    "rematch": False,
    # This value must be the same as the synthesis grain size to avoid the
    # speeding up or slowing down of the resulting file in relation to the
    # original.
    "grain_size": 100,
-    "overlap": 2,
+    "overlap": 8,
-    # Defines the number of matches to keep for synthesis.
+    # Defines the number of matches to keep for synthesis. Note that this must
-    "match_quantity": 20
+    # also be specified in the synthesis config
    "match_quantity": 5,
    # Choose the algorithm used to perform matching. kdtree is recommended for
    # larger datasets.
    "method": 'kdtree'
 }
 synthesizer = {
    # Artificially scale the output grain by the difference in RMS values
    # between source and target.
    "enforce_intensity": True,
    # Specify the ratio limit that the grain can be scaled by.
    "enf_intensity_ratio_limit": 1000.,
    # Artificially modify the pitch by the difference in f0 values between
    # source and target.
    "enforce_f0": True,
    # Specify the ratio limit that the grain can be modified by.
    "enf_f0_ratio_limit": 10.,
    "grain_size": 100,
    "overlap": 8,
    # Normalize output, avoid clipping of final output by scaling the final
    # frames.
    "normalize" : True,
    # Defines the number of potential grains to choose from matches when
    # synthesizing output.
    "match_quantity": 5
 }
 # Specifies the format for the output file. Changing this has not been tested
 # so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
 database = {
    "symlink": True
 }
@@ -1,36 +1,139 @@
 # Specify analysis parameters for root mean square analysis.
 rms = {
    # Analysis window sizes can be changed for each analysis individually.
    # These do not need to match the grain size of the matcher or synthesis.
    "window_size": 100,
-    "overlap": 2,
+    "overlap": 8,
 }
 f0 = {
    "window_size": 4096,
    "overlap": 8,
    # Currently all frames below this ratio are discarded and left as silence.
    # Different databases will require different values for the best results.
    # Noisier databases will need lower values than more tonal databases.
    "ratio_threshold": 0.45
 }
 # Specify analysis parameters for variance analysis.
 variance = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal kurtosis analysis.
 kurtosis = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for temporal skewness analysis.
 skewness = {
    "window_size": 100,
    "overlap": 8
 }
 # Specify analysis parameters for FFT analysis.
 fft = {
    # The FFT window size determines the window size for all spectral analyses.
    "window_size": 4096
 }
 database = {
    # Enables creation of symbolic links to files not in the database rather
    # than making physical copies.
    "symlink": True
 }
 # Sets the weighting for each analysis. A higher weighting gives an analysis
 # higher precedence when finding the best matches.
 matcher_weightings = {
    "f0" : 0.5,
    "spccntr" : 1.,
    "spcsprd" : 1.,
    "spcflux" : 3.,
    "spccf" : 3.,
    "spcflatness": 3.,
    "zerox" : 1.,
    "rms" : 0.1,
    "peak": 0.1,
    "centroid": 0.5,
    "kurtosis": 2.,
    "skewness": 2.,
    "variance": 0.,
    "harm_ratio": 2
 }
 # Specifies the method for averaging analysis frames to create a single value
 # for comparing to other grains. Possible formatters are: 'mean', 'median',
 # 'log2_mean', 'log2_median'
 analysis_dict = {
    # log2_median formats using mel scale. This is useful for analyses such as
    # F0.
    "f0": "log2_median",
-    "rms": "mean"
+    "rms": "mean",
    "zerox": "mean",
    "spccntr": "median",
    "spcsprd": "median",
    "spcflux": "median",
    "spccf": "median",
    "spcflatness": "median",
    "peak": "mean",
    "centroid": "mean",
    "kurtosis": "mean",
    "skewness": "mean",
    "variance": "mean",
    "harm_ratio": "mean"
 }
 analysis = {
    # Force the deletion of any pre-existing analyses to create new ones. This
    # is needed for overwriting old analyses generated with different
    # parameters to the current ones.
    "reanalyse": False
 }
 matcher = {
    # Force the re-matching of analyses
    "rematch": False,
    # This value must be the same as the synthesis grain size to avoid the
    # speeding up or slowing down of the resulting file in relation to the
    # original.
    "grain_size": 100,
    "overlap": 8,
    # Defines the number of matches to keep for synthesis. Note that this must
    # also be specified in the synthesis config
    "match_quantity": 5,
    # Choose the algorithm used to perform matching. kdtree is recommended for
    # larger datasets.
    "method": 'kdtree'
 }
 synthesizer = {
    # Artificially scale the output grain by the difference in RMS values
    # between source and target.
    "enforce_intensity": True,
    # Specify the ratio limit that the grain can be scaled by.
    "enf_intensity_ratio_limit": 1000.,
    # Artificially modify the pitch by the difference in f0 values between
    # source and target.
    "enforce_f0": True,
    # Specify the ratio limit that the grain can be modified by.
    "enf_f0_ratio_limit": 10.,
    "grain_size": 100,
    "overlap": 8,
    # Normalize output, avoid clipping of final output by scaling the final
    # frames.
    "normalize" : True,
    # Defines the number of potential grains to choose from matches when
    # synthesizing output.
    "match_quantity": 5
 }
 # Specifies the format for the output file. Changing this has not been tested
 # so may produce errors/undesirable results.
 output_file = {
    "samplerate": 44100,
    "format": 131075,
    "channels": 1
 }
 synthesizer = {
    "enforce_rms": True,
    "enf_rms_ratio_limit": 5.,
    "enforce_f0": True,
    "enf_f0_ratio_limit": 10.,
    "grain_size": 100,
    "overlap": 2,
    "normalize" : True,
    # Defines the number of potential grains to choose from matches when
    # synthesizing output.
    "match_quantity": 20
 }
 database = {
    "symlink": True
 }
@@ -84,7 +84,7 @@ For this demonstration, the following file structure will be used:
        |-- target.03.wav
        `-- target.04.wav
-A source database containing a small selection of trumpet samples (aquired from
+A source database containing a small selection of trumpet samples (acquired from
 http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4
 target sounds. This will produce 4 output files, one for each target sound.
@@ -417,7 +417,9 @@ concatenate.py Script Flags
 --match_method        Choose the algorithm to use when matching analyses. Available algorithms are:
-                         Brute force: 'bruteforce'
+                         Brute force: 'bruteforce' (BROKEN. The brute force
                         matcher no longer works with the current release of
                         this script. Use the K-d Tree Search.)
                         K-d Tree Search: 'kdtree'