diff --git a/install.sh b/install.sh index edbebcc..642ee49 100755 --- a/install.sh +++ b/install.sh @@ -5,3 +5,4 @@ pip install pysndfile pip install h5py pip install https://github.com/Pezz89/fileops/zipball/master pip install -e ./ +pip install sklearn diff --git a/src/sppysound/docs/analysis_config.py b/src/sppysound/docs/analysis_config.py index 99fc036..9300559 100644 --- a/src/sppysound/docs/analysis_config.py +++ b/src/sppysound/docs/analysis_config.py @@ -1,23 +1,139 @@ +# Specify analysis parameters for root mean square analysis. rms = { + # Analysis window sizes can be changed for each analysis individually. + # These do not need to match the grain size of the matcher or synthesis. "window_size": 100, - "overlap": 2, + "overlap": 8, } +f0 = { + "window_size": 4096, + "overlap": 8, + # Currently all frames below this ratio are discarded and left as silence. + # Different databases will require different values for the best results. + # Noisier databases will need lower values than more tonal databases. + "ratio_threshold": 0.45 +} + +# Specify analysis parameters for variance analysis. +variance = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal kurtosis analysis. +kurtosis = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal skewness analysis. +skewness = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for FFT analysis. +fft = { + # The FFT window size determines the window size for all spectral analyses. + "window_size": 4096 +} + +database = { + # Enables creation of symbolic links to files not in the database rather + # than making physical copies. + "symlink": True +} + +# Sets the weighting for each analysis. A higher weighting gives an analysis +# higher precedence when finding the best matches. 
+matcher_weightings = { + "f0" : 0.5, + "spccntr" : 1., + "spcsprd" : 1., + "spcflux" : 3., + "spccf" : 3., + "spcflatness": 3., + "zerox" : 1., + "rms" : 0.1, + "peak": 0.1, + "centroid": 0.5, + "kurtosis": 2., + "skewness": 2., + "variance": 0., + "harm_ratio": 2 +} + +# Specifies the method for averaging analysis frames to create a single value +# for comparing to other grains. Possible formatters are: 'mean', 'median', +# 'log2_mean', 'log2_median' analysis_dict = { + # log2_median formats using mel scale. This is useful for analyses such as + # F0. "f0": "log2_median", - "rms": "mean" + "rms": "mean", + "zerox": "mean", + "spccntr": "median", + "spcsprd": "median", + "spcflux": "median", + "spccf": "median", + "spcflatness": "median", + "peak": "mean", + "centroid": "mean", + "kurtosis": "mean", + "skewness": "mean", + "variance": "mean", + "harm_ratio": "mean" } analysis = { + # Force the deletion of any pre-existing analyses to create new ones. This + # is needed for overwriting old analyses generated with different + # parameters to the current ones. "reanalyse": False } +matcher = { + # Force the re-matching of analyses + "rematch": False, + # This value must be the same as the synthesis grain size to avoid the + # speeding up or slowing down of the resulting file in relation to the + # original. + "grain_size": 100, + "overlap": 8, + # Defines the number of matches to keep for synthesis. Note that this must + # also be specified in the synthesis config + "match_quantity": 5, + # Choose the algorithm used to perform matching. kdtree is recommended for + # larger datasets. + "method": 'kdtree' +} + +synthesizer = { + # Artificially scale the output grain by the difference in RMS values + # between source and target. + "enforce_intensity": True, + # Specify the ratio limit that the grain can be scaled by. + "enf_intensity_ratio_limit": 1000., + # Artificially modify the pitch by the difference in f0 values between + # source and target. 
+ "enforce_f0": True, + # Specify the ratio limit that the grain can be modified by. + "enf_f0_ratio_limit": 10., + "grain_size": 100, + "overlap": 8, + # Normalize output, avoid clipping of final output by scaling the final + # frames. + "normalize" : True, + # Defines the number of potential grains to choose from matches when + # synthesizing output. + "match_quantity": 5 +} + +# Specifies the format for the output file. Changing this has not been tested +# so may produce errors/undesirable results. output_file = { "samplerate": 44100, "format": 131075, "channels": 1 } - -database = { - "symlink": True -} diff --git a/src/sppysound/docs/descriptor_defs.rst b/src/sppysound/docs/descriptor_defs.rst index 1bc309c..c220fcb 100644 --- a/src/sppysound/docs/descriptor_defs.rst +++ b/src/sppysound/docs/descriptor_defs.rst @@ -15,7 +15,7 @@ used to determine the central point of a signal's amplitude and is calculated as: .. math:: - C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} \cdot x(n)} + C(n) = \frac{\sum_{i=i_s(n)}^{i_e(n)}(i-i_s(n)) \cdot x(i)}{\sum_{i=i_s(n)}^{i_e(n)} x(i)}. Ref: :cite:`lerch2012itaca` @@ -34,10 +34,10 @@ defined as: .. math:: R_n(m) = \sum_{i=i_s(n)}^{i_e(n)} x(i) x(i-m) -Then normalizing: +then normalizing: .. math:: - \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}} + \Gamma_n(m) = \frac{R_n(m)}{\sqrt{\sum_{i=i_s(n)}^{i_e(n)}x(i)^2 \sum_{i=i_s(n)}^{i_e(n)}x(i-m)^2}}. The fundamental period of the signal is then calculated as the point between :math:`T_{min}` and :math:`T_{max}` at which the correlated signal most closely matches the @@ -45,7 +45,7 @@ original. :math:`T_{min}` and :math:`T_{max}` are defined as the minimum and max the fundamental period. .. math:: - y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_i(m)\} + y = arg\,max_{T_{min} \leq m \leq T_{max}} \{\Gamma_n(m)\}. 
In order to improve the accuracy of peak detection, parabolic interpolation is used to estimate the peak's location with greater accuracy by using the peak @@ -66,7 +66,7 @@ Ref: :cite:`smith2011sasp` From this, the fundamental period the frequency is then calculated as: .. math:: - f_0^n = \frac{1}{T_0^n} + f_0^n = \frac{1}{T_0^n}. Ref: :cite:`itaa2014` @@ -83,7 +83,7 @@ the signal. The calculation of the STFT is defined as: .. math:: X(k,n) = \sum_{i=i_s(n)}^{i_e(n)} x(i) \exp{\Big(-jk \cdot (i - - i_s(n))\frac{2\pi}{K}\Big)} + i_s(n))\frac{2\pi}{K}\Big)}. Ref: :cite:`lerch2012itaca` @@ -96,7 +96,7 @@ of confidence measure in determining the validity of F0 values. It is calculated as part of the F0 estimation algorithm as: .. math:: - HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{T_n(m)\}} + HR(n) = max_{T_{min} \leq m \leq T_{max}}{\{\Gamma_n(m)\}}. Ref: :cite:`lerch2012itaca` @@ -107,7 +107,7 @@ values indicate a flatter distribution and positive values indicate a more "peaky" distribution. Kurtosis is calculated as: .. math:: - TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3 + TK(n)=\frac{1}{\sigma_x^4(n) \cdot K}\sum_{i=i_s(n)}^{i_e(n)}\Big(x(i)-\mu_x(n)\Big)^4-3. Ref: :cite:`lerch2012itaca` @@ -117,16 +117,17 @@ Peak amplitude measures the highest peak in the absolute signal. It is calculated as: .. math:: - P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\} + P(n) = \max_{i_s(n) \leq i \leq i_e(n)}\{\left|x(i)\right|\}. RMS ~~~ The perceived loudness of a signal is an important feature as it can be related to the dynamics of the signal. RMS is used as a measure of sound intensity and -is used for distinguishing between loud and quiet audio. It is calculated as: +is used for distinguishing between loud and quiet audio. It is calculated as, +where :math:`K` is the total number of samples: .. math:: - RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2} + RMS(n) = \sqrt{\frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)} x(i)^2}. 
Other methods that take the human perception of loudness into account may provide more perceptually relevant results. However the RMS measurement @@ -142,7 +143,7 @@ values indicate that the spectral content is centred in higher frequencies and lower value indicate a lower centre. The spectral centroid is calculated as: .. math:: - SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2} + SC(n) = \frac{\sum_{k=0}^{K/2-1} k \cdot | X(k,n) | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}. The result is the sum of magnitudes, weighted by their index, normalized by the unweighted sum. @@ -158,7 +159,7 @@ This differentiates between flat spectrums and sinusoidal spectrums. (low values representing the former and high values representing the latter.) .. math:: - SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | } + SCF = \frac{ \max_{0 \leq k \leq K/2-1} \{| X(k,n) | \}}{\sum_{k=0}^{K/2-1} | X(k,n) | }. Ref: :cite:`lerch2012itaca` @@ -171,7 +172,7 @@ values that represent a more tonal signal. Spectral flatness is calculated as: .. math:: TFl(n) = \frac{\sqrt[K/2]{\prod_{k=0}^{K/2-1} | X(k,n) | }}{2/K \cdot - \sum_{k=0}^{K/2-1} | X(k,n) | } + \sum_{k=0}^{K/2-1} | X(k,n) | }. Ref: :cite:`lerch2012itaca` @@ -184,7 +185,7 @@ similar frames (that suggests a steady state signal). It is calculated as: .. math:: SF(n) = \frac{\sqrt{\sum_{k=0}^{K/2-1} \Big( | X(k,n) | - | X(k,n-1) | \Big)^2 - }}{K/2} + }}{K/2}. Ref: :cite:`lerch2012itaca` @@ -196,7 +197,7 @@ and is associated with perceptions of timbre. It is calculated as: .. math:: SS(n) = \sqrt{\frac{\sum_{k=0}^{K/2-1} \Big(k-SC(n)\Big)^2 \cdot | X(k,n) - | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}} + | ^2}{\sum_{k=0}^{K/2-1} | X(k,n) | ^2}}. Ref: :cite:`lerch2012itaca` @@ -206,7 +207,7 @@ The variance of a signal measures it's spread around the signal's arithmetic mean. It is used in the calculation of Kurtosis and is calculated as: .. 
math:: - \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2 + \sigma_x^2 = \frac{1}{K} \sum_{i=i_s(n)}^{i_e(n)}(x(i) - \mu_x(n))^2. Ref: :cite:`lerch2012itaca` @@ -218,7 +219,7 @@ a signal, as noisy signals will pass from positive to negative more frequently than period signals. It is calculated as: .. math:: - Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] | + Z(n) = \frac{1}{2K} \sum_{i=i_s(n)}^{i_e(n)} | sgn[x(i)] - sgn[x(i-1)] | \text{Where the sgn function is defined as:} diff --git a/src/sppysound/docs/matching_config.py b/src/sppysound/docs/matching_config.py index 4d63733..9300559 100644 --- a/src/sppysound/docs/matching_config.py +++ b/src/sppysound/docs/matching_config.py @@ -1,36 +1,139 @@ +# Specify analysis parameters for root mean square analysis. rms = { + # Analysis window sizes can be changed for each analysis individually. + # These do not need to match the grain size of the matcher or synthesis. "window_size": 100, - "overlap": 2, + "overlap": 8, } -analysis_dict = { - "f0": "log2_median", - "rms": "mean" +f0 = { + "window_size": 4096, + "overlap": 8, + # Currently all frames below this ratio are discarded and left as silence. + # Different databases will require different values for the best results. + # Noisier databases will need lower values than more tonal databases. + "ratio_threshold": 0.45 } +# Specify analysis parameters for variance analysis. +variance = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal kurtosis analysis. +kurtosis = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal skewness analysis. +skewness = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for FFT analysis. +fft = { + # The FFT window size determines the window size for all spectral analyses. 
+ "window_size": 4096 +} + +database = { + # Enables creation of symbolic links to files not in the database rather + # than making physical copies. + "symlink": True +} + +# Sets the weighting for each analysis. A higher weighting gives an analysis +# higher precedence when finding the best matches. matcher_weightings = { - "f0" : 1., - "rms": 1. + "f0" : 0.5, + "spccntr" : 1., + "spcsprd" : 1., + "spcflux" : 3., + "spccf" : 3., + "spcflatness": 3., + "zerox" : 1., + "rms" : 0.1, + "peak": 0.1, + "centroid": 0.5, + "kurtosis": 2., + "skewness": 2., + "variance": 0., + "harm_ratio": 2 +} + +# Specifies the method for averaging analysis frames to create a single value +# for comparing to other grains. Possible formatters are: 'mean', 'median', +# 'log2_mean', 'log2_median' +analysis_dict = { + # log2_median formats using mel scale. This is useful for analyses such as + # F0. + "f0": "log2_median", + "rms": "mean", + "zerox": "mean", + "spccntr": "median", + "spcsprd": "median", + "spcflux": "median", + "spccf": "median", + "spcflatness": "median", + "peak": "mean", + "centroid": "mean", + "kurtosis": "mean", + "skewness": "mean", + "variance": "mean", + "harm_ratio": "mean" } analysis = { + # Force the deletion of any pre-existing analyses to create new ones. This + # is needed for overwriting old analyses generated with different + # parameters to the current ones. "reanalyse": False } matcher = { + # Force the re-matching of analyses "rematch": False, + # This value must be the same as the synthesis grain size to avoid the + # speeding up or slowing down of the resulting file in relation to the + # original. "grain_size": 100, - "overlap": 2, - # Defines the number of matches to keep for synthesis. - "match_quantity": 20 + "overlap": 8, + # Defines the number of matches to keep for synthesis. Note that this must + # also be specified in the synthesis config + "match_quantity": 5, + # Choose the algorithm used to perform matching. 
kdtree is recommended for + # larger datasets. + "method": 'kdtree' } +synthesizer = { + # Artificially scale the output grain by the difference in RMS values + # between source and target. + "enforce_intensity": True, + # Specify the ratio limit that the grain can be scaled by. + "enf_intensity_ratio_limit": 1000., + # Artificially modify the pitch by the difference in f0 values between + # source and target. + "enforce_f0": True, + # Specify the ratio limit that the grain can be modified by. + "enf_f0_ratio_limit": 10., + "grain_size": 100, + "overlap": 8, + # Normalize output, avoid clipping of final output by scaling the final + # frames. + "normalize" : True, + # Defines the number of potential grains to choose from matches when + # synthesizing output. + "match_quantity": 5 +} + +# Specifies the format for the output file. Changing this has not been tested +# so may produce errors/undesirable results. output_file = { "samplerate": 44100, "format": 131075, "channels": 1 } - -database = { - "symlink": True -} diff --git a/src/sppysound/docs/synthesis_config.py b/src/sppysound/docs/synthesis_config.py index a3afa73..9300559 100644 --- a/src/sppysound/docs/synthesis_config.py +++ b/src/sppysound/docs/synthesis_config.py @@ -1,36 +1,139 @@ +# Specify analysis parameters for root mean square analysis. rms = { + # Analysis window sizes can be changed for each analysis individually. + # These do not need to match the grain size of the matcher or synthesis. "window_size": 100, - "overlap": 2, + "overlap": 8, } +f0 = { + "window_size": 4096, + "overlap": 8, + # Currently all frames below this ratio are discarded and left as silence. + # Different databases will require different values for the best results. + # Noisier databases will need lower values than more tonal databases. + "ratio_threshold": 0.45 +} + +# Specify analysis parameters for variance analysis. 
+variance = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal kurtosis analysis. +kurtosis = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for temporal skewness analysis. +skewness = { + "window_size": 100, + "overlap": 8 +} + +# Specify analysis parameters for FFT analysis. +fft = { + # The FFT window size determines the window size for all spectral analyses. + "window_size": 4096 +} + +database = { + # Enables creation of symbolic links to files not in the database rather + # than making physical copies. + "symlink": True +} + +# Sets the weighting for each analysis. A higher weighting gives an analysis +# higher precedence when finding the best matches. +matcher_weightings = { + "f0" : 0.5, + "spccntr" : 1., + "spcsprd" : 1., + "spcflux" : 3., + "spccf" : 3., + "spcflatness": 3., + "zerox" : 1., + "rms" : 0.1, + "peak": 0.1, + "centroid": 0.5, + "kurtosis": 2., + "skewness": 2., + "variance": 0., + "harm_ratio": 2 +} + +# Specifies the method for averaging analysis frames to create a single value +# for comparing to other grains. Possible formatters are: 'mean', 'median', +# 'log2_mean', 'log2_median' analysis_dict = { + # log2_median formats using mel scale. This is useful for analyses such as + # F0. "f0": "log2_median", - "rms": "mean" + "rms": "mean", + "zerox": "mean", + "spccntr": "median", + "spcsprd": "median", + "spcflux": "median", + "spccf": "median", + "spcflatness": "median", + "peak": "mean", + "centroid": "mean", + "kurtosis": "mean", + "skewness": "mean", + "variance": "mean", + "harm_ratio": "mean" } analysis = { + # Force the deletion of any pre-existing analyses to create new ones. This + # is needed for overwriting old analyses generated with different + # parameters to the current ones. 
"reanalyse": False } +matcher = { + # Force the re-matching of analyses + "rematch": False, + # This value must be the same as the synthesis grain size to avoid the + # speeding up or slowing down of the resulting file in relation to the + # original. + "grain_size": 100, + "overlap": 8, + # Defines the number of matches to keep for synthesis. Note that this must + # also be specified in the synthesis config + "match_quantity": 5, + # Choose the algorithm used to perform matching. kdtree is recommended for + # larger datasets. + "method": 'kdtree' +} + +synthesizer = { + # Artificially scale the output grain by the difference in RMS values + # between source and target. + "enforce_intensity": True, + # Specify the ratio limit that the grain can be scaled by. + "enf_intensity_ratio_limit": 1000., + # Artificially modify the pitch by the difference in f0 values between + # source and target. + "enforce_f0": True, + # Specify the ratio limit that the grain can be modified by. + "enf_f0_ratio_limit": 10., + "grain_size": 100, + "overlap": 8, + # Normalize output, avoid clipping of final output by scaling the final + # frames. + "normalize" : True, + # Defines the number of potential grains to choose from matches when + # synthesizing output. + "match_quantity": 5 +} + +# Specifies the format for the output file. Changing this has not been tested +# so may produce errors/undesirable results. output_file = { "samplerate": 44100, "format": 131075, "channels": 1 } - -synthesizer = { - "enforce_rms": True, - "enf_rms_ratio_limit": 5., - "enforce_f0": True, - "enf_f0_ratio_limit": 10., - "grain_size": 100, - "overlap": 2, - "normalize" : True, - # Defines the number of potential grains to choose from matches when - # synthesizing output. 
- "match_quantity": 20 -} - -database = { - "symlink": True -} diff --git a/src/sppysound/docs/tutorial.rst b/src/sppysound/docs/tutorial.rst index 38a5a31..9e9f1e5 100644 --- a/src/sppysound/docs/tutorial.rst +++ b/src/sppysound/docs/tutorial.rst @@ -84,7 +84,7 @@ For this demonstration, the following file structure will be used: |-- target.03.wav `-- target.04.wav -A source database containing a small selection of trumpet samples (aquired from +A source database containing a small selection of trumpet samples (acquired from http://theremin.music.uiowa.edu/MIS.html) will be used to match grains with 4 target sounds. This will produce 4 output files, one for each target sound. @@ -417,7 +417,9 @@ concatenate.py Script Flags --match_method Choose the algorithm to use when matching analyses. Available algorithms are: - Brute force: 'bruteforce' + Brute force: 'bruteforce' (BROKEN. The brute force + matcher no longer works with the current release of + this script. Use the K-d Tree Search.) K-d Tree Search: 'kdtree'