18 Commits

Author SHA1 Message Date
Sam Perry 4465fae7d3 Final modifications before submission 2017-04-03 21:08:23 +01:00
Sam Perry 02e426a8e8 Actually fixed numpy indexing bug 2017-04-02 12:32:00 +01:00
bp 2a16ace7a5 Fixed minor issue in F0 analysis 2017-04-02 10:30:37 +01:00
Sam Perry 38ea3ff76e Implemented temporary fix for pitch shifting issues 2017-04-02 10:19:45 +01:00
bp b581e578b9 Tryed to fix pitch shifting 2017-04-02 09:45:51 +01:00
Sam Perry bc578eee73 Merge branch 'dev' of https://github.com/Pezz89/PySoundConcat into dev 2017-04-01 11:03:06 +01:00
Sam Perry 0b6d56392e Fixed grain indexing bug 2017-03-26 19:22:55 +01:00
Sam Perry 4ec7b13bcf Paramterzed inharmonic grain silencing in synthesizer 2017-03-26 10:08:35 +01:00
Sam Perry 0cac135882 Commented up continuity matching 2017-03-24 14:11:47 +00:00
Sam Perry 2182774048 Added naive approach to continuity matching 2017-03-24 09:32:55 +00:00
Sam Perry e10a7f2f18 Begun implementing continuity match selection 2017-03-23 16:48:21 +00:00
Sam Perry 86fda63151 Begun implementing brute-force match continuity selector 2017-03-19 17:39:05 +00:00
Sam Perry d87b9cc3ed Minor changes 2017-03-18 21:18:41 +00:00
Sam Perry bc6bb4ea83 Added pYIN F0 estimation 2017-03-18 19:33:26 +00:00
Sam Perry 5e0e4dd5ed Fixed import issues in F0 analysis 2017-03-18 12:01:36 +00:00
Sam Perry a123097ab3 Updated requirements.txt and revised installer 2017-03-18 11:00:52 +00:00
Sam Perry e9c9ca3bff Merged overcommit files 2017-03-18 10:12:24 +00:00
Sam Perry dd235ef977 Setup new project for QMUL dev 2017-03-18 10:02:02 +00:00
54 changed files with 1841 additions and 306 deletions
+47
View File
@@ -0,0 +1,47 @@
#!/usr/bin/env python
import subprocess
import os
import pdb
import fnmatch
import sys
def main():
p = subprocess.Popen(["git", "rev-parse", "--show-toplevel"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
out = out.strip('\n')
track_filepath = os.path.join(out, ".gittrack")
p = subprocess.Popen(["git", "ls-files", out, "--exclude-standard", "--others"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
out = out.splitlines()
try:
with open(track_filepath) as f:
content = f.read().splitlines()
except IOError:
return 0
untracked = []
for filepath in out:
for name in content:
if fnmatch.fnmatch(filepath, name):
untracked.append(filepath)
if untracked:
print "The following files are not tracked: "
for i in untracked:
print i
print "Please either stage these files for the commit or add them to the project's .gitignore to disregard them."
return 1
else:
return 0
if __name__ == "__main__":
exit(main())
+39
View File
@@ -5,3 +5,42 @@
Session.vim
pip-selfcheck.json
.DS_Store
src/spfileops/
playground
bin/
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
# Log files
*.log
external/
CMakeFiles/
+2
View File
@@ -0,0 +1,2 @@
*.py
*.sh
+41
View File
@@ -0,0 +1,41 @@
# Use this file to configure the Overcommit hooks you wish to use. This will
# extend the default configuration defined in:
# https://github.com/brigade/overcommit/blob/master/config/default.yml
#
# At the topmost level of this YAML file is a key representing type of hook
# being run (e.g. pre-commit, commit-msg, etc.). Within each type you can
# customize each hook, such as whether to only run it on certain files (via
# `include`), whether to only display output if it fails (via `quiet`), etc.
#
# For a complete list of hooks, see:
# https://github.com/brigade/overcommit/tree/master/lib/overcommit/hook
#
# For a complete list of options that you can use to customize hooks, see:
# https://github.com/brigade/overcommit#configuration
#
# Uncomment the following lines to make the configuration take effect.
#PreCommit:
# RuboCop:
# enabled: true
# on_warn: fail # Treat all warnings as failures
#
# TrailingWhitespace:
# enabled: true
# exclude:
# - '**/db/structure.sql' # Ignore trailing whitespace in generated files
#
#PostCheckout:
# ALL: # Special hook name that customizes all hooks of this type
# quiet: true # Change all post-checkout hooks to only display output on failure
#
# IndexTags:
# enabled: true # Generate a tags file with `ctags` each time HEAD changes
PreCommit:
CheckUntracked:
enabled: true
quiet: false
description: 'Check for files that should be tracked or ignored.'
required_executable: './.git-hooks/pre-commit/check_untracked.py'
+1
View File
@@ -0,0 +1 @@
2.7.11
-1
View File
@@ -1 +0,0 @@
./src/sppysound/concatenator.py
-1
View File
@@ -1 +0,0 @@
./src/sppysound/config.py
-1
View File
@@ -1 +0,0 @@
./src/sppysound/Examples
+1
View File
@@ -0,0 +1 @@
pip install -r requirements.txt
-8
View File
@@ -1,8 +0,0 @@
set -euo pipefail
pip install numpy
pip install scipy
pip install pysndfile
pip install h5py
pip install https://github.com/Pezz89/fileops/zipball/master
pip install -e ./
pip install sklearn
+8
View File
@@ -0,0 +1,8 @@
-e git://github.com/Pezz89/SPFileOps@9bab7f59c339443903dea6cf42873a335f966d28#egg=spfileops
h5py==2.6.0
matplotlib==1.5.3
numpy==1.11.2
pysndfile==0.2.11
scipy==0.18.1
sphinx_bootstrap_theme==0.4.12
scikit_learn==0.18.1
-1
View File
@@ -1 +0,0 @@
./src/tests/run_tests.sh
+1 -2
View File
@@ -86,8 +86,7 @@ class Analysis(object):
size containing frames for these times.
"""
times = self.analysis_group[self.name]["times"][:]
start = start / 1000
end = end / 1000
# Seconds to milliseconds
vtimes = times.reshape(-1, 1)
selection = np.transpose((vtimes >= start) & (vtimes <= end))
+2 -3
View File
@@ -68,7 +68,7 @@ class CentroidAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -77,7 +77,7 @@ class CentroidAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
@@ -112,5 +112,4 @@ class CentroidAnalysis(Analysis):
# multiply by the frame numbers.
centroid_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
centroid_times = centroid_times / samplerate
return centroid_times
+27 -114
View File
@@ -7,9 +7,11 @@ from numpy.lib import stride_tricks
from Analysis import Analysis
from scipy import signal
from numpy.fft import fft, ifft, fftshift
from sppysound import multirate
import multirate
import warnings
import matplotlib.pyplot as plt
import pYIN.pYINmain as pYINmain
from numpy import polyfit, arange
class F0Analysis(Analysis):
@@ -50,6 +52,11 @@ class F0Analysis(Analysis):
self.analysis_group = analysis_group
self.logger.info("Creating F0 analysis for {0}".format(self.AnalysedAudioFile.name))
self.hopSize = int(np.floor(self.overlap * self.window_size))
self.pYinInst = pYINmain.PyinMain()
self.pYinInst.initialise(channels = 1, inputSampleRate = self.AnalysedAudioFile.samplerate, stepSize = self.hopSize, blockSize = self.window_size,
lowAmp = 0.00, onsetSensitivity = 0.5, pruneThresh = 0.0)
self.create_analysis(
frames,
self.AnalysedAudioFile.samplerate,
@@ -66,13 +73,7 @@ class F0Analysis(Analysis):
"""
times = self.analysis_group["F0"]["times"][:]
frames = self.analysis_group["F0"]["frames"][:]
hr = self.analysis_group["F0"]["harmonic_ratio"][:]
start = start / 1000
end = end / 1000
vtimes = times.reshape(-1, 1)
nan_inds = hr < self.threshold
hr[nan_inds] = np.nan
frames[nan_inds] = np.nan
selection = np.transpose((vtimes >= start) & (vtimes <= end))
if not selection.any():
@@ -80,10 +81,10 @@ class F0Analysis(Analysis):
closest_frames = np.abs(vtimes-frame_center).argsort()[:2]
selection[closest_frames] = True
return ((frames, times, hr), selection)
return ((frames, times), selection)
@staticmethod
def create_f0_analysis(
self,
frames,
samplerate,
window_size=512,
@@ -98,122 +99,38 @@ class F0Analysis(Analysis):
Calculate the frequency and harmonic ratio values of windowed segments
of the audio file and save to disk.
"""
if hasattr(frames, '__call__'):
frames = frames()
if not M:
M=int(round(0.016*samplerate))
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = frames
#samples = np.concatenate((np.zeros(np.floor(window_size/2.0)), frames))
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
cols = np.ceil((len(samples) - window_size) / float(self.hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.concatenate((samples, np.zeros(window_size)))
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
shape=(int(cols), window_size),
strides=(samples.strides[0]*self.hopSize, samples.strides[0])
)
# TODO: Replace this with zero crossing object.
def feature_zcr(window):
window2 = np.zeros(window.size)
window2[1:-1] = window[0:-2]
Z = (1/(2*window.size)) * np.sum(np.abs(np.sign(window)-np.sign(window2)))
return Z
for ind, frame in enumerate(frames):
fs = self.pYinInst.process(frame)
output = self.pYinInst.getSmoothedPitchTrack()
# TODO: Create proper fix for this
try:
output[output < 0] = np.nan
except:
output = np.array([np.nan])
def parabolic(f, x):
"""
Quadratic interpolation for estimating the true position of an
inter-sample maximum when nearby samples are known.
f is a vector and x is an index for that vector.
Returns (vx, vy), the coordinates of the vertex of a parabola that
goes through point x and its two neighbors.
Example:
Defining a vector f with a local maximum at index 3 (= 6), find
local maximum if points 2, 3, and 4 actually defined a parabola.
In [3]: f = [2, 3, 1, 6, 4, 2, 3, 1]
In [4]: parabolic(f, argmax(f))
Out[4]: (3.2142857142857144, 6.1607142857142856)
Ref: https://gist.github.com/endolith/255291
"""
if x >= f.size-1 or x <= 2:
return x, f[x]
xv = 1/2. * (f[x-1] - f[x+1]) / (f[x-1] - 2 * f[x] + f[x+1]) + x
yv = f[x] - 1/4. * (f[x-1] - f[x+1]) * (xv - x)
return (xv, yv)
def per_frame_f0(frames, m0, M):
if not frames.any():
HR = np.nan
f0 = np.nan
return f0, HR
#R=autocorr([frames])[0]
R = np.correlate(frames, frames, mode='full')
g=R[frames.size]
R=R[frames.size-1:]
if not m0:
# estimate m0 (as the first zero crossing of R)
m0 = np.argmin(np.diff(np.sign(R[1:])))+1
if m0 == 1:
m0 = R.size
if M > R.size:
M = R.size
Gamma = np.zeros(M)
CSum = np.cumsum(frames*frames)
with warnings.catch_warnings():
warnings.filterwarnings('error')
try:
Gamma[m0:M] = R[m0:M] / (np.sqrt([g*CSum[-m0:-M:-1]])+np.finfo(float).eps)
except Warning:
pass
# compute T0 and harmonic ratio:
if np.isnan(Gamma).any():
HR = np.nan
f0 = np.nan
else:
blag = np.argmax(Gamma)
HR = Gamma[blag]
interp, HR = parabolic(Gamma, blag)
if not interp:
f0 = np.nan
HR = np.nan
else:
# get fundamental frequency:
f0 = samplerate / interp
if f0 > samplerate/2:
raise ValueError("F0 value ({0}) is above the nyquist rate "
"({1}). This shouldn't happen...".format(f0,
samplerate/2))
if HR >= 1:
HR = 1
return (f0, HR)
output = np.apply_along_axis(per_frame_f0, 1, frames, m0, M)
# output = np.empty((frames.shape[0], 2))
# for ind, i in enumerate(frames):
# output[ind] = per_frame_f0(i, m0, M)
return output
@@ -224,12 +141,9 @@ class F0Analysis(Analysis):
samplerate = self.AnalysedAudioFile.samplerate
frames = args[0]
# frames = multirate.interp(frames, 4)
samplerate *= 1
data = self.create_f0_analysis(frames, samplerate, **kwargs)
f0 = data[:, 0]
harmonic_ratio = data[:, 1]
f0 = self.create_f0_analysis(frames, samplerate, **kwargs)
f0_times = self.calc_f0_frame_times(f0, frames, samplerate)
return ({'frames': f0, 'harmonic_ratio': harmonic_ratio, 'times': f0_times}, {})
return ({'frames': f0, 'times': f0_times}, {})
@staticmethod
def calc_f0_frame_times(f0frames, sample_frames, samplerate):
@@ -246,14 +160,13 @@ class F0Analysis(Analysis):
# multiply by the frame numbers.
f0_times = (sample_frames.shape[0]/timebins) * scale[:-1]
# Divide by the samplerate to give times in seconds
f0_times = f0_times / samplerate
return f0_times
def analysis_formatter(self, data, selection, format):
"""Calculate the average analysis value of the grain using the match format specified."""
frames, times, harm_ratio = data
frames, times = data
# Get indexes of all valid frames (that aren't nan)
valid_inds = np.isfinite(frames) & np.isfinite(harm_ratio)
valid_inds = np.isfinite(frames)
format_style_dict = {
'mean': np.mean,
@@ -46,8 +46,6 @@ class F0HarmRatioAnalysis(Analysis):
"""
times = self.analysis_group["F0"]["times"][:]
hr = self.analysis_group["F0"]["harmonic_ratio"][:]
start = start / 1000
end = end / 1000
vtimes = times.reshape(-1, 1)
nan_inds = hr < self.threshold
@@ -65,7 +63,6 @@ class F0HarmRatioAnalysis(Analysis):
def calc_F0HarmRatio_frame_times(F0HarmRatioframes, sample_frames, samplerate):
"""Calculate times for frames using sample size and samplerate."""
samplerate *= 1
if hasattr(sample_frames, '__call__'):
sample_frames = sample_frames()
@@ -77,7 +74,6 @@ class F0HarmRatioAnalysis(Analysis):
# multiply by the frame numbers.
F0HarmRatio_times = (sample_frames.shape[0]/timebins) * scale[:-1]
# Divide by the samplerate to give times in seconds
F0HarmRatio_times = F0HarmRatio_times / samplerate
return F0HarmRatio_times
def analysis_formatter(self, data, selection, format):
+2 -5
View File
@@ -77,8 +77,6 @@ class FFTAnalysis(Analysis):
size containing frames for these times.
"""
times = self.analysis_group["FFT"]["times"][:]
start = start / 1000
end = end / 1000
vtimes = times.reshape(-1, 1)
selection = np.transpose((vtimes >= start) & (vtimes <= end))
@@ -127,8 +125,8 @@ class FFTAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, frameSize),
strides=(samples.strides[0]*hopSize, samples.strides[0])
shape=(int(cols), frameSize),
strides=(int(samples.strides[0]*hopSize), samples.strides[0])
).copy()
frames *= win
@@ -230,7 +228,6 @@ class FFTAnalysis(Analysis):
# multiply by the frame numbers.
fft_times = (sample_frames.shape[0]/timebins) * scale[:-1]
# Divide by the samplerate to give times in seconds
fft_times = fft_times / samplerate
return fft_times
+7 -5
View File
@@ -37,7 +37,7 @@ class KurtosisAnalysis(Analysis):
self.AnalysedAudioFile = AnalysedAudioFile
if config:
self.window_size = config.kurtosis["window_size"] * self.AnalysedAudioFile.samplerate / 1000
self.window_size = config.kurtosis["window_size"]
self.overlap = 1. / config.kurtosis["overlap"]
try:
@@ -75,7 +75,7 @@ class KurtosisAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -84,13 +84,16 @@ class KurtosisAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
if window:
win = window(window_size)
frames *= win
try:
frames *= win
except:
pdb.set_trace()
frame_mean = np.mean(frames, axis=1)
@@ -126,5 +129,4 @@ class KurtosisAnalysis(Analysis):
# multiply by the frame numbers.
kurtosis_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
kurtosis_times = kurtosis_times / samplerate
return kurtosis_times
+2 -3
View File
@@ -63,7 +63,7 @@ class PeakAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -72,7 +72,7 @@ class PeakAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
@@ -104,5 +104,4 @@ class PeakAnalysis(Analysis):
# multiply by the frame numbers.
peak_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
peak_times = peak_times / samplerate
return peak_times
+3 -4
View File
@@ -40,7 +40,7 @@ class RMSAnalysis(Analysis):
self.AnalysedAudioFile = AnalysedAudioFile
if config:
self.window_size = config.rms["window_size"] * self.AnalysedAudioFile.samplerate / 1000
self.window_size = config.rms["window_size"]
self.overlap = 1. / config.rms["overlap"]
else:
self.window_size=512
@@ -88,7 +88,7 @@ class RMSAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -97,7 +97,7 @@ class RMSAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
@@ -133,5 +133,4 @@ class RMSAnalysis(Analysis):
# multiply by the frame numbers.
rms_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
rms_times = rms_times / samplerate
return rms_times
+3 -4
View File
@@ -37,7 +37,7 @@ class SkewnessAnalysis(Analysis):
self.AnalysedAudioFile = AnalysedAudioFile
if config:
self.window_size = config.skewness["window_size"] * self.AnalysedAudioFile.samplerate / 1000
self.window_size = config.skewness["window_size"]
self.overlap = 1. / config.skewness["overlap"]
try:
@@ -75,7 +75,7 @@ class SkewnessAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -84,7 +84,7 @@ class SkewnessAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
@@ -125,5 +125,4 @@ class SkewnessAnalysis(Analysis):
# multiply by the frame numbers.
skewness_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
skewness_times = skewness_times / samplerate
return skewness_times
@@ -97,6 +97,5 @@ class SpectralCentroidAnalysis(Analysis):
# multiply by the frame numbers.
spccntr_times = (float(sample_frame_count)/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
spccntr_times = spccntr_times / samplerate
return spccntr_times
@@ -88,7 +88,6 @@ class SpectralCrestFactorAnalysis(Analysis):
# multiply by the frame numbers.
spccf_times = (float(sample_frame_count)/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
spccf_times = spccf_times / samplerate
return spccf_times
def mean_formatter(self, data):
@@ -91,7 +91,6 @@ class SpectralFlatnessAnalysis(Analysis):
# multiply by the frame numbers.
spcflatness_times = (float(sample_frame_count)/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
spcflatness_times = spcflatness_times / samplerate
return spcflatness_times
def mean_formatter(self, data):
@@ -92,7 +92,6 @@ class SpectralFluxAnalysis(Analysis):
# multiply by the frame numbers.
spcflux_times = (float(sample_frame_count)/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
spcflux_times = spcflux_times / samplerate
return spcflux_times
def mean_formatter(self, data):
@@ -107,7 +107,6 @@ class SpectralSpreadAnalysis(Analysis):
# multiply by the frame numbers.
spcsprd_times = (float(sample_frame_count)/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
spcsprd_times = spcsprd_times / samplerate
return spcsprd_times
def mean_formatter(self, data):
+3 -4
View File
@@ -39,7 +39,7 @@ class VarianceAnalysis(Analysis):
self.AnalysedAudioFile = AnalysedAudioFile
if config:
self.window_size = config.variance["window_size"] * self.AnalysedAudioFile.samplerate / 1000
self.window_size = config.variance["window_size"]
self.overlap = 1. / config.variance["overlap"]
self.analysis_group = analysis_group
@@ -68,7 +68,7 @@ class VarianceAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -77,7 +77,7 @@ class VarianceAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
@@ -110,5 +110,4 @@ class VarianceAnalysis(Analysis):
# multiply by the frame numbers.
variance_times = (float(sample_frames.shape[0])/float(timebins)) * scale[:-1].astype(float)
# Divide by the samplerate to give times in seconds
variance_times = variance_times / samplerate
return variance_times
+2 -3
View File
@@ -47,7 +47,7 @@ class ZeroXAnalysis(Analysis):
hopSize = int(window_size - np.floor(overlapFac * window_size))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(window_size/2.0)), frames)
samples = np.append(np.zeros(int(window_size/2.0)), frames)
# cols for windowing
cols = np.ceil((len(samples) - window_size) / float(hopSize)) + 1
@@ -61,7 +61,7 @@ class ZeroXAnalysis(Analysis):
frames = stride_tricks.as_strided(
samples,
shape=(cols, window_size),
shape=(int(cols), window_size),
strides=(samples.strides[0]*hopSize, samples.strides[0])
).copy()
zero_crossing = np.sum(np.abs(np.diff(np.sign(frames))), axis=1)
@@ -82,7 +82,6 @@ class ZeroXAnalysis(Analysis):
# multiply by the frame numbers.
zerox_times = (sample_frames.shape[0]/timebins) * scale[:-1]
# Divide by the samplerate to give times in seconds
zerox_times = zerox_times / samplerate
return zerox_times
def hdf5_dataset_formatter(self, *args, **kwargs):
+74
View File
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
from MonoNoteHMM import MonoNoteHMM
from MonoNoteParameters import MonoNoteParameters
import numpy as np
import time
class FrameOutput(object):
def __init__(self, frameNumber, pitch, noteState):
self.frameNumber = frameNumber
self.pitch = pitch
self.noteState = noteState
class MonoNote(object):
def __init__(self):
self.hmm = MonoNoteHMM()
def process(self, pitchProb):
obsProb = [self.hmm.calculatedObsProb(pitchProb[0]), ]
for iFrame in range(1, len(pitchProb)):
obsProb += [self.hmm.calculatedObsProb(pitchProb[iFrame])]
out = []
path, scale = self.hmm.decodeViterbi(obsProb)
for iFrame in range(len(path)):
currPitch = -1.0
stateKind = 0
currPitch = self.hmm.par.minPitch + (path[iFrame]/self.hmm.par.nSPP) * 1.0/self.hmm.par.nPPS
stateKind = (path[iFrame]) % self.hmm.par.nSPP + 1
out.append(FrameOutput(iFrame, currPitch, stateKind))
return out
+187
View File
@@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import numpy as np
from SparseHMM import SparseHMM
from MonoNoteParameters import MonoNoteParameters
from math import *
from scipy.stats import norm
class MonoNoteHMM(SparseHMM):
def __init__(self):
SparseHMM.__init__(self)
self.par = MonoNoteParameters()
self.pitchDistr = []
self.build()
def calculatedObsProb(self, pitchProb):
# pitchProb is a list of pairs (pitches and their probabilities)
nCandidate = len(pitchProb)
# what is the probability of pitched
pIsPitched = 0.0
for iCandidate in range(nCandidate):
# pIsPitched = pitchProb[iCandidate].second > pIsPitched ? pitchProb[iCandidate].second : pIsPitched;
pIsPitched += pitchProb[iCandidate][1]
# the pitched probability, check Ryynanen's paper
pIsPitched = pIsPitched * (1-self.par.priorWeight) + self.par.priorPitchedProb * self.par.priorWeight
out = np.zeros((self.par.n,), dtype=np.float64)
tempProbSum = 0
for i in range(self.par.n):
if i % self.par.nSPP != 2:
# std::cerr << getMidiPitch(i) << std::endl;
tempProb = 0.0
if nCandidate > 0:
minDist = 10000.0
minDistProb = 0.0
minDistCandidate = 0
for iCandidate in range(nCandidate):
currDist = fabs(self.getMidiPitch(i)-pitchProb[iCandidate][0])
if (currDist < minDist):
minDist = currDist
minDistProb = pitchProb[iCandidate][1]
minDistCandidate = iCandidate
tempProb = pow(minDistProb, self.par.yinTrust) * self.pitchDistr[i].pdf(pitchProb[minDistCandidate][0])
else:
tempProb = 1
tempProbSum += tempProb
out[i] = tempProb
for i in range(self.par.n):
if i % self.par.nSPP != 2:
if tempProbSum > 0:
out[i] = out[i] / tempProbSum * pIsPitched
else:
# the prob of non pitched
out[i] = (1-pIsPitched) / (self.par.nPPS * self.par.nS)
return out
def getMidiPitch(self, index):
return self.pitchDistr[index].mean()
def getFrequency(self, index):
return 440 * pow(2.0, (self.pitchDistr[index].mean()-69)/12)
def build(self):
# the states are organised as follows:
# 0-2. lowest pitch
# 0. attack state
# 1. stable state
# 2. silent state
# 3-5. second-lowest pitch
# 3. attack state
# ...
# observation distributions
for iState in range(self.par.n):
self.pitchDistr.append(norm(loc=0, scale=1))
if iState % self.par.nSPP == 2:
# silent state starts tracking
self.init = np.append(self.init, np.float64(1.0/(self.par.nS * self.par.nPPS)))
else:
self.init = np.append(self.init, np.float64(0.0))
for iPitch in range(self.par.nS * self.par.nPPS):
index = iPitch * self.par.nSPP # each pitch has 3 state
mu = self.par.minPitch + iPitch * 1.0/self.par.nPPS
self.pitchDistr[index] = norm(loc=mu, scale=self.par.sigmaYinPitchAttack)
self.pitchDistr[index+1] = norm(loc=mu, scale=self.par.sigmaYinPitchStable)
self.pitchDistr[index+2] = norm(loc=mu, scale=1.0) # dummy
# this might be the note transition probability function
noteDistanceDistr = norm(loc=0, scale=self.par.sigma2Note)
for iPitch in range(self.par.nS * self.par.nPPS):
# loop through all notes and set sparse transition probabilities
index = iPitch * self.par.nSPP
# transitions from attack state
self.fromIndex = np.append(self.fromIndex, np.uint64(index))
self.toIndex = np.append(self.toIndex, np.uint64(index))
self.transProb = np.append(self.transProb, np.float64(self.par.pAttackSelftrans))
self.fromIndex = np.append(self.fromIndex, np.uint64(index))
self.toIndex = np.append(self.toIndex, np.uint64(index+1))
self.transProb = np.append(self.transProb, np.float64(1-self.par.pAttackSelftrans))
# transitions from stable state
self.fromIndex = np.append(self.fromIndex, np.uint64(index+1))
self.toIndex = np.append(self.toIndex, np.uint64(index+1)) # to itself
self.transProb = np.append(self.transProb, np.float64(self.par.pStableSelftrans))
self.fromIndex = np.append(self.fromIndex, np.uint64(index+1))
self.toIndex = np.append(self.toIndex, np.uint64(index+2)) # to silent
self.transProb = np.append(self.transProb, np.float64(self.par.pStable2Silent))
# the "easy" transitions from silent state
self.fromIndex = np.append(self.fromIndex, np.uint64(index+2))
self.toIndex = np.append(self.toIndex, np.uint64(index+2))
self.transProb = np.append(self.transProb, np.float64(self.par.pSilentSelftrans))
# the more complicated transitions from the silent
# this prob only applies to transitions from silent to non silent
# which is the note transition
probSumSilent = 0.0
tempTransProbSilent = []
for jPitch in range(self.par.nS * self.par.nPPS):
fromPitch = iPitch
toPitch = jPitch
semitoneDistance = fabs(fromPitch - toPitch) * 1.0 / self.par.nPPS
if semitoneDistance == 0 or \
(semitoneDistance > self.par.minSemitoneDistance and semitoneDistance < self.par.maxJump):
toIndex = jPitch * self.par.nSPP # note attack index
tempWeightSilent = noteDistanceDistr.pdf(semitoneDistance)
probSumSilent += tempWeightSilent
tempTransProbSilent.append(tempWeightSilent)
self.fromIndex = np.append(self.fromIndex, np.uint64(index+2)) # from a silence
self.toIndex = np.append(self.toIndex, np.uint64(toIndex)) # to an attack
for i in range(len(tempTransProbSilent)):
self.transProb = np.append(self.transProb,
np.float64(((1-self.par.pSilentSelftrans) * tempTransProbSilent[i]/probSumSilent)))
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import numpy as np
class MonoNoteParameters(object):
def __init__(self):
self.minPitch = 35
self.nPPS = 3 # 3 steps per semitone
self.nS = 69
self.nSPP = 3 # states per pitch
self.n = 0
self.initPi = np.array([], dtype=np.float64)
self.pAttackSelftrans = 0.9
self.pStableSelftrans = 0.99
self.pStable2Silent = 0.01
self.pSilentSelftrans = 0.9999
self.sigma2Note = 0.7
self.maxJump = 13.0
self.pInterSelftrans = 0.0
self.priorPitchedProb = 0.7
self.priorWeight = 0.5
self.minSemitoneDistance = 0.5
self.sigmaYinPitchAttack = 5.0
self.sigmaYinPitchStable = 0.8
self.sigmaYinPitchInter = 0.1
self.yinTrust = 0.1
self.n = self.nPPS * self.nS * self.nSPP
+75
View File
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
from MonoPitchHMM import MonoPitchHMM
import numpy as np
from math import *
class MonoPitch(object):
def __init__(self):
self.hmm = MonoPitchHMM()
def process(self, pitchProb):
obsProb = [self.hmm.calculatedObsProb(pitchProb[0]),]
for iFrame in range(1,len(pitchProb)):
obsProb += [self.hmm.calculatedObsProb(pitchProb[iFrame])]
out = np.array([], dtype=np.float32)
path, scale = self.hmm.decodeViterbi(obsProb)
for iFrame in range(len(path)):
hmmFreq = self.hmm.m_freqs[path[iFrame]]
bestFreq = 0.0
leastDist = 10000.0
if hmmFreq > 0:
# This was a Yin estimate, so try to get original pitch estimate back
# ... a bit hacky, since we could have direclty saved the frequency
# that was assigned to the HMM bin in hmm.calculateObsProb -- but would
# have had to rethink the interface of that method.
for iPitch in range(len(pitchProb[iFrame])):
freq = 440. * pow(2.0, (pitchProb[iFrame][iPitch][0] - 69)/12.0)
dist = fabs(hmmFreq-freq)
if dist < leastDist:
leastDist = dist
bestFreq = freq
else:
bestFreq = hmmFreq
out = np.append(out, bestFreq)
return out
+136
View File
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
from SparseHMM import SparseHMM
from math import *
import numpy as np
class MonoPitchHMM(SparseHMM):
def __init__(self):
SparseHMM.__init__(self)
self.m_minFreq = 61.735
self.m_nBPS = 5
self.m_nPitch = 0
self.m_selfTrans = 0.99
self.m_yinTrust = 0.5
self.m_transitionWidth = 5*(np.uint64(self.m_nBPS/2)) + 1 # 2 semi-tones of frame jump
self.m_nPitch = 69 * self.m_nBPS # 69 semi-tone, each semi-tone divided to 5, step is 20 cents
self.m_freqs = np.zeros(2*self.m_nPitch, dtype=np.float64)
for iPitch in range(self.m_nPitch):
self.m_freqs[iPitch] = self.m_minFreq * pow(2, iPitch * 1.0 / (12 * self.m_nBPS)) # 0 to m_nPitch-1 positive pitch
self.m_freqs[iPitch+self.m_nPitch] = -self.m_freqs[iPitch] # m_nPitch to 2*m_nPitch-1 negative pitch
self.build()
def calculatedObsProb(self, pitchProb):
# pitchProb is the pitch candidates of one frame
out = np.zeros((2*self.m_nPitch+1,), dtype=np.float64)
probYinPitched = 0.0
# BIN THE PITCHES
for iPair in range(len(pitchProb)):
freq = 440. * pow(2.0, (pitchProb[iPair][0] - 69)/12.0)
if freq <= self.m_minFreq: continue
d = 0
oldd = 1000
for iPitch in range(self.m_nPitch):
d = fabs(freq-self.m_freqs[iPitch])
if oldd < d and iPitch > 0:
# previous bin must have been the closest
# when iPitch move far away from freq candidate
# add pitch prob to probYinPitched, break
out[iPitch-1] = pitchProb[iPair][1]
probYinPitched += out[iPitch-1]
break
oldd = d
probReallyPitched = self.m_yinTrust * probYinPitched
# damn, I forget what this is all about...
# don't understand this part, inspired by note tracking method
for iPitch in range(self.m_nPitch):
if probYinPitched > 0: out[iPitch] *= (probReallyPitched/probYinPitched) # times self.m_yinTrust
# non voiced pitch obs
# 1 - sum(pitchProb)*0.5
# this observation prob is very small, but equal for every unvoiced state
# so that the sum of them are 1 - sum(pitchProb)*0.5
out[iPitch+self.m_nPitch] = (1 - probReallyPitched) / self.m_nPitch
return out
def build(self):
# initial vector, uniform distribution
self.init = np.ones((2*self.m_nPitch), dtype=np.float64) * 1.0/2*self.m_nPitch
# transitions
for iPitch in range(self.m_nPitch):
theoreticalMinNextPitch = int(iPitch)-int(self.m_transitionWidth/2)
minNextPitch = iPitch-int(self.m_transitionWidth/2) if iPitch>self.m_transitionWidth/2 else 0
maxNextPitch = iPitch+int(self.m_transitionWidth/2) if iPitch<self.m_nPitch-self.m_transitionWidth/2 else self.m_nPitch-1
# weight vector, triangle, maximum is at iPitch
weightSum = 0
weights = np.array([], dtype=np.float64)
for i in range(minNextPitch, maxNextPitch+1):
if i <= iPitch:
weights = np.append(weights, np.float64(i-theoreticalMinNextPitch+1))
else:
weights = np.append(weights, np.float64(iPitch-theoreticalMinNextPitch+1-(i-iPitch)))
weightSum += weights[len(weights)-1]
for i in range(minNextPitch, maxNextPitch+1):
# from voiced to voiced
self.fromIndex = np.append(self.fromIndex, np.uint64(iPitch))
self.toIndex = np.append(self.toIndex, np.uint64(i))
self.transProb = np.append(self.transProb, np.float64(weights[i-minNextPitch] / weightSum * self.m_selfTrans))
# from voiced to non voiced
self.fromIndex = np.append(self.fromIndex, np.uint64(iPitch))
self.toIndex = np.append(self.toIndex, np.uint64(i+self.m_nPitch))
self.transProb = np.append(self.transProb, np.float64(weights[i-minNextPitch] / weightSum * (1-self.m_selfTrans)))
# from non voiced to non voiced
self.fromIndex = np.append(self.fromIndex, np.uint64(iPitch+self.m_nPitch))
self.toIndex = np.append(self.toIndex, np.uint64(i+self.m_nPitch))
self.transProb = np.append(self.transProb, np.float64(weights[i-minNextPitch] / weightSum * self.m_selfTrans))
# from non voiced to voiced
self.fromIndex = np.append(self.fromIndex, np.uint64(iPitch+self.m_nPitch))
self.toIndex = np.append(self.toIndex, np.uint64(i))
self.transProb = np.append(self.transProb, np.float64(weights[i-minNextPitch] / weightSum * (1-self.m_selfTrans)))
+129
View File
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import numpy as np
from math import *
class SparseHMM(object):
def __init__(self):
self.init = np.array([], dtype=np.float64)
self.transProb = np.array([], dtype=np.float64)
self.fromIndex = np.array([], dtype=np.uint64)
self.toIndex = np.array([],dtype=np.uint64)
def calculatedObsProb(self, data):
# to be overloaded
return data
def decodeViterbi(self, obsProb):
if len(obsProb) < 1: return np.array([], dtype=np.int)
nState = len(self.init)
nFrame = len(obsProb)
# check for consistency
nTrans = len(self.transProb)
# declaring variables
scale = np.array([], dtype=np.float64)
delta = np.zeros((nState,), dtype=np.float64)
oldDelta = np.zeros((nState,), dtype=np.float64)
path = np.ones(nFrame, dtype=np.int) * (nState-1) # the final output path
deltasum = 0
# initialise first frame in time 1, rabiner 32a
for iState in range(nState):
oldDelta[iState] = self.init[iState] * obsProb[0][iState]
deltasum += oldDelta[iState]
for iState in range(nState):
oldDelta[iState] /= deltasum # normalise (scale)
scale = np.append(scale, np.double(1.0/deltasum))
psi = [np.zeros(nState, dtype=np.int),] # matrix of remembered indices of the best transitions
# rest of forward step
for iFrame in range(1, nFrame):
deltasum = 0
psi = psi + [np.zeros(nState, dtype=np.int)]
# calculate best previous state for every current state
# this is the "sparse" loop
for iTrans in range(nTrans):
fromState = self.fromIndex[iTrans]
toState = self.toIndex[iTrans]
currentTransProb = self.transProb[iTrans]
currentValue = oldDelta[fromState] * currentTransProb
if (currentValue > delta[toState]): # to find the maximum
delta[toState] = currentValue # just change the toState delta, will be multiplied by the right obs later!
psi[iFrame][toState] = fromState # rabiner 33b
for jState in range(nState):
delta[jState] *= obsProb[iFrame][jState]
deltasum += delta[jState]
if deltasum > 0:
for iState in range(nState):
oldDelta[iState] = delta[iState] / deltasum # normalise (scale)
delta[iState] = 0
scale = np.append(scale, np.double(1.0/deltasum))
else:
print "WARNING: Viterbi has been fed some zero probabilities, at least they become zero at frame " + str(iFrame) + " in combination with the model."
for iState in range(nState):
oldDelta[iState] = 1.0/nState
delta[iState] = 0
scale = np.append(scale, np.double(1.0/deltasum))
# initialise backward step
bestValue = 0
for iState in range(nState):
currentValue = oldDelta[iState] # use directly the normalised delta
if currentValue > bestValue:
bestValue = currentValue # rabiner 34b
path[nFrame-1] = iState # path of last frame
for iFrame in reversed(range(nFrame-1)):
path[iFrame] = psi[iFrame+1][path[iFrame+1]]
return path, scale
+120
View File
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import numpy as np
from math import *
import YinUtil
class Yin(object):
def __init__(self):
self.m_frameSize = 2048
self.m_inputSampleRate = 44100
self.m_thresh = 0.2
self.m_threshDistr = 2
self.m_yinBufferSize = self.m_frameSize/2
self.m_fast = True
def Yin(self, frameSize, inputSampleRate, thresh = 0.2, fast = True):
self.m_frameSize = frameSize
self.m_inputSampleRate = inputSampleRate
self.m_thresh = thresh
self.m_threshDistr = 2
self.m_yinBufferSize = frameSize/2
self.m_fast = fast
class YinOutput(object):
def __init__(self, f0 = 0.0, periodicity = 0.0, rms = 0.0):
self.f0 = f0
self.periodicity = periodicity
self.rms = rms
self.salience = np.array([], dtype=np.float64)
self.freqProb = np.array([], dtype=np.float64)
def processProbabilisticYin(self, input):
# calculate aperiodicity function for all periods, output stores in yinBuffer
if self.m_fast:
yinBuffer = YinUtil.fastDifference(input, self.m_yinBufferSize)
else:
yinBuffer = YinUtil.slowDifference(input, self.m_yinBufferSize)
yinBuffer = YinUtil.cumulativeDifference(yinBuffer ,self.m_yinBufferSize)
peakProbability = YinUtil.yinProb(yinBuffer, self.m_threshDistr, self.m_yinBufferSize, 0, 0)
# calculate overall "probability" from peak probability, overall "probability" probSum seems never be used
rms = sqrt(YinUtil.sumSquare(input, 0, self.m_yinBufferSize)/self.m_yinBufferSize)
yo = Yin.YinOutput(0.0, 0.0, rms)
firstStack = False
for iBuf in range(self.m_yinBufferSize):
yo.salience = np.append(yo.salience, peakProbability[iBuf])
# if peakProb > 0, a fundamental frequency candidate is generated
if peakProbability[iBuf] > 0:
currentF0 = self.m_inputSampleRate * (1.0 / YinUtil.parabolicInterpolation(yinBuffer, iBuf, self.m_yinBufferSize))
if firstStack == False:
yo.freqProb = np.array([np.array([currentF0, peakProbability[iBuf]], dtype=np.float64),])
firstStack = True
else:
yo.freqProb = np.vstack((yo.freqProb, np.array([currentF0, peakProbability[iBuf]], dtype=np.float64)))
return yo
def setThreshold(self, parameter):
self.m_thresh = parameter
return 0
def setThresholdDistr(self, parameter):
self.m_threshDistr = parameter
return 0
def setFrameSize(self, parameter):
m_frameSize = parameter
m_yinBufferSize = m_frameSize/2
return 0
def setFast(self, parameter):
m_fast = parameter
return 0
+213
View File
@@ -0,0 +1,213 @@
from math import *
import numpy as np
def slowDifference(input, yinBufferSize):
yinBuffer = np.zeros((yinBufferSize,), dtype=np.float64)
startPoint = 0
endPoint = 0
for i in range(yinBufferSize):
startPoint = yinBufferSize/2 - i/2
endPoint = startPoint + yinBufferSize
for j in range(startPoint,endPoint):
delta = input[i+j] - input[j]
yinBuffer[i] += delta * delta
return yinBuffer
def fastDifference(input, yinBufferSize):
frameSize = 2 * yinBufferSize
# DECLARE AND INITIALISE
yinBuffer = np.zeros((yinBufferSize,), dtype=np.float64)
powerTerms = np.zeros((yinBufferSize,), dtype=np.float64)
kernel = np.zeros((frameSize,), dtype=np.float64)
yinStyleACFReal = np.zeros((frameSize,), dtype=np.float64)
yinStyleACFImag = np.zeros((frameSize,), dtype=np.float64)
# POWER TERM CALCULATION
# ... for the power terms in equation (7) in the Yin paper
for j in range(yinBufferSize):
powerTerms[0] += input[j] * input[j]
# now iteratively calculate all others, second term in equation (7)
for tau in range(1, yinBufferSize):
powerTerms[tau] = powerTerms[tau-1] - input[tau-1] * input[tau-1] + input[tau+yinBufferSize] * input[tau+yinBufferSize]
# YIN-STYLE AUTOCORRELATION via FFT
# 1. data
at = np.fft.fft(input, frameSize)
audioTransformedReal = at.real
audioTransformedImag = at.imag
# 2. half of the data, disguised as a convolution kernel
for j in range(yinBufferSize):
kernel[j] = input[yinBufferSize-1-j]
kt = np.fft.fft(kernel, frameSize)
kernelTransformedReal = kt.real
kernelTransformedImag = kt.imag
# 3. convolution via complex multiplication
for j in range(frameSize):
yinStyleACFReal[j] = audioTransformedReal[j]*kernelTransformedReal[j] - audioTransformedImag[j]*kernelTransformedImag[j]
yinStyleACFImag[j] = audioTransformedReal[j]*kernelTransformedImag[j] + audioTransformedImag[j]*kernelTransformedReal[j]
yinStyleACF = np.array(yinStyleACFReal, dtype=np.float64) + np.array(yinStyleACFImag, dtype=np.float64)*1j
iat = np.fft.ifft(yinStyleACF, frameSize)
# CALCULATION OF difference function
for j in range(yinBufferSize):
yinBuffer[j] = powerTerms[0] + powerTerms[j] - 2 * iat.real[j+yinBufferSize-1]
return yinBuffer
def cumulativeDifference(yinBuffer ,yinBufferSize):
yinBuffer[0] = 1.0
runningSum = 0.0
for tau in range(1, yinBufferSize):
runningSum += yinBuffer[tau]
if runningSum == 0:
yinBuffer[tau] = 1
else:
yinBuffer[tau] *= tau / runningSum
return yinBuffer
uniformDist = [0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000,0.0100000]
betaDist1 = [0.028911,0.048656,0.061306,0.068539,0.071703,0.071877,0.069915,0.066489,0.062117,0.057199,0.052034,0.046844,0.041786,0.036971,0.032470,0.028323,0.024549,0.021153,0.018124,0.015446,0.013096,0.011048,0.009275,0.007750,0.006445,0.005336,0.004397,0.003606,0.002945,0.002394,0.001937,0.001560,0.001250,0.000998,0.000792,0.000626,0.000492,0.000385,0.000300,0.000232,0.000179,0.000137,0.000104,0.000079,0.000060,0.000045,0.000033,0.000024,0.000018,0.000013,0.000009,0.000007,0.000005,0.000003,0.000002,0.000002,0.000001,0.000001,0.000001,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000]
betaDist2 = [0.012614,0.022715,0.030646,0.036712,0.041184,0.044301,0.046277,0.047298,0.047528,0.047110,0.046171,0.044817,0.043144,0.041231,0.039147,0.036950,0.034690,0.032406,0.030133,0.027898,0.025722,0.023624,0.021614,0.019704,0.017900,0.016205,0.014621,0.013148,0.011785,0.010530,0.009377,0.008324,0.007366,0.006497,0.005712,0.005005,0.004372,0.003806,0.003302,0.002855,0.002460,0.002112,0.001806,0.001539,0.001307,0.001105,0.000931,0.000781,0.000652,0.000542,0.000449,0.000370,0.000303,0.000247,0.000201,0.000162,0.000130,0.000104,0.000082,0.000065,0.000051,0.000039,0.000030,0.000023,0.000018,0.000013,0.000010,0.000007,0.000005,0.000004,0.000003,0.000002,0.000001,0.000001,0.000001,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000]
betaDist3 = [0.006715,0.012509,0.017463,0.021655,0.025155,0.028031,0.030344,0.032151,0.033506,0.034458,0.035052,0.035331,0.035332,0.035092,0.034643,0.034015,0.033234,0.032327,0.031314,0.030217,0.029054,0.027841,0.026592,0.025322,0.024042,0.022761,0.021489,0.020234,0.019002,0.017799,0.016630,0.015499,0.014409,0.013362,0.012361,0.011407,0.010500,0.009641,0.008830,0.008067,0.007351,0.006681,0.006056,0.005475,0.004936,0.004437,0.003978,0.003555,0.003168,0.002814,0.002492,0.002199,0.001934,0.001695,0.001481,0.001288,0.001116,0.000963,0.000828,0.000708,0.000603,0.000511,0.000431,0.000361,0.000301,0.000250,0.000206,0.000168,0.000137,0.000110,0.000088,0.000070,0.000055,0.000043,0.000033,0.000025,0.000019,0.000014,0.000010,0.000007,0.000005,0.000004,0.000002,0.000002,0.000001,0.000001,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000]
betaDist4 = [0.003996,0.007596,0.010824,0.013703,0.016255,0.018501,0.020460,0.022153,0.023597,0.024809,0.025807,0.026607,0.027223,0.027671,0.027963,0.028114,0.028135,0.028038,0.027834,0.027535,0.027149,0.026687,0.026157,0.025567,0.024926,0.024240,0.023517,0.022763,0.021983,0.021184,0.020371,0.019548,0.018719,0.017890,0.017062,0.016241,0.015428,0.014627,0.013839,0.013068,0.012315,0.011582,0.010870,0.010181,0.009515,0.008874,0.008258,0.007668,0.007103,0.006565,0.006053,0.005567,0.005107,0.004673,0.004264,0.003880,0.003521,0.003185,0.002872,0.002581,0.002312,0.002064,0.001835,0.001626,0.001434,0.001260,0.001102,0.000959,0.000830,0.000715,0.000612,0.000521,0.000440,0.000369,0.000308,0.000254,0.000208,0.000169,0.000136,0.000108,0.000084,0.000065,0.000050,0.000037,0.000027,0.000019,0.000014,0.000009,0.000006,0.000004,0.000002,0.000001,0.000001,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000]
single10 = [0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000]
single15 = [0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000]
single20 = [0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000]
def yinProb(yinBuffer, prior, yinBufferSize, minTau0, maxTau0):
minTau = 2
maxTau = yinBufferSize
# adapt period range, if necessary
if minTau0 > 0 and minTau0 < maxTau0: minTau = minTau0
if maxTau0 > 0 and maxTau0 < yinBufferSize and maxTau0 > minTau: maxTau = maxTau0
minWeight = 0.01
thresholds = np.array([], dtype=np.float32)
distribution = np.array([], dtype=np.float32)
peakProb = np.zeros((yinBufferSize,), dtype=np.float64)
nThreshold = 100
nThresholdInt = nThreshold
for i in range(nThresholdInt):
thresholds = np.append(thresholds, np.double(0.01 + i*0.01))
if prior == 0:
distribution = np.append(distribution, uniformDist[i])
elif prior == 1:
distribution = np.append(distribution, betaDist1[i])
elif prior == 2:
distribution = np.append(distribution, betaDist2[i])
elif prior == 3:
distribution = np.append(distribution, betaDist3[i])
elif prior == 4:
distribution = np.append(distribution, betaDist4[i])
elif prior == 5:
distribution = np.append(distribution, single10[i])
elif prior == 6:
distribution = np.append(distribution, single15[i])
elif prior == 7:
distribution = np.append(distribution, single20[i])
else:
distribution = np.append(distribution, uniformDist[i])
tau = minTau
minInd = 0
minVal = 42.0
sumProb = 0.0
while tau+1 < maxTau:
# yinBuffer < 1 && ...
if yinBuffer[tau] < thresholds[len(thresholds)-1] and yinBuffer[tau+1] < yinBuffer[tau]:
# search for all dip points
while tau + 1 < maxTau and yinBuffer[tau+1] < yinBuffer[tau]:
tau += 1
# tau is now local minimum,
# because it's the turning point from yinBuffer[tau+1] < yinBuffer[tau] to yinBuffer[tau+1] >= yinBuffer[tau]
if yinBuffer[tau] < minVal and tau > 2:
minVal = yinBuffer[tau] # mininum d'
minInd = tau
currThreshInd = nThresholdInt-1
# formula (4), the threshold is on y-axis, the probability of P is the cumulation of distribution
# when d'(tau) < threshold
while thresholds[currThreshInd] > yinBuffer[tau] and currThreshInd > -1:
peakProb[tau] += distribution[currThreshInd]
currThreshInd -= 1
sumProb += peakProb[tau]
tau += 1
else:
tau += 1
if peakProb[minInd] > 1:
print "WARNING: yin has prob > 1 ??? I'm returning all zeros instead."
return np.zeros((yinBufferSize,), dtype=np.float64)
nonPeakProb = 1.0
if sumProb > 0:
for i in range(minTau, maxTau):
# nomalization, the max prob will be peakProb[minInd]
peakProb[i] = peakProb[i] / sumProb * peakProb[minInd]
nonPeakProb -= peakProb[i]
if minInd > 0:
# adds nonPeakProb only for the prob with minimum d(tau)
# because here we have a small threshold s, for all tau d'(tau) > s
# we choose tau as the index of global minimum of d'
peakProb[minInd] += nonPeakProb * minWeight
return peakProb
def parabolicInterpolation(yinBuffer, tau, yinBufferSize):
# this is taken almost literally from Joren Six's Java implementation
if tau == yinBufferSize: # not valid anyway.
return tau
betterTau = 0.0
if tau > 0 and tau < yinBufferSize-1:
s0 = yinBuffer[tau-1]
s1 = yinBuffer[tau]
s2 = yinBuffer[tau+1]
adjustment = (s2 - s0) / (2 * (2 * s1 - s2 - s0))
if np.fabs(adjustment)>1: adjustment = 0
betterTau = tau + adjustment
else:
#print "WARNING: can't do interpolation at the edge (tau = " + str(tau) + "), will return un-interpolated value.\n"
betterTau = tau
return betterTau
def sumSquare(input, start, end):
out = 0.0
for i in range(start,end):
out += input[i] * input[i]
return out
def RMS(inputBuffers, blockSize):
rms = 0.0
for i in range(blockSize):
rms += inputBuffers[i] * inputBuffers[i]
rms /= blockSize
rms = sqrt(rms)
return rms
+103
View File
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import os, sys
import pYINmain
import essentia.standard as ess
import numpy as np
from YinUtil import RMS
def pYINPtNote(filename1,fs=44100,frameSize=2048,hopSize=256):
'''
Given filename, return pitchtrack and note transcription track
:param filename1:
:param fs:
:param frameSize:
:param hopSize:
:return:
'''
# initialise
pYinInst = pYINmain.PyinMain()
pYinInst.initialise(channels = 1, inputSampleRate = fs, stepSize = hopSize, blockSize = frameSize,
lowAmp = 0.25, onsetSensitivity = 0.7, pruneThresh = 0.1)
# frame-wise calculation
audio = ess.MonoLoader(filename = filename1, sampleRate = fs)()
# rms mean
# rms = []
# for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
# rms.append(RMS(frame, frameSize))
# rmsMean = np.mean(rms)
# print 'rmsMean', rmsMean
for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
fs = pYinInst.process(frame)
# calculate smoothed pitch and mono note
monoPitch = pYinInst.getSmoothedPitchTrack()
# output smoothed pitch track
print 'pitch track'
for ii in fs.m_oSmoothedPitchTrack:
print ii.values
print '\n'
fs = pYinInst.getRemainingFeatures(monoPitch)
# output of mono notes,
# column 0: frame number,
# column 1: pitch in midi numuber, this is the decoded pitch
# column 2: attack 1, stable 2, silence 3
print 'mono note decoded pitch'
for ii in fs.m_oMonoNoteOut:
print ii.frameNumber, ii.pitch, ii.noteState
print '\n'
print 'note pitch tracks'
for ii in fs.m_oNotePitchTracks:
print ii
print '\n'
# median pitch in Hz of the notes
print 'median note pitch'
for ii in fs.m_oNotes:
print ii.values
print '\n'
+278
View File
@@ -0,0 +1,278 @@
# -*- coding: utf-8 -*-
'''
* Copyright (C) 2015 Music Technology Group - Universitat Pompeu Fabra
*
* This file is part of pypYIN
*
* pypYIN is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation (FSF), either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the Affero GNU General Public License
* version 3 along with this program. If not, see http://www.gnu.org/licenses/
*
* If you have any problem about this python version code, please contact: Rong Gong
* rong.gong@upf.edu
*
* If you have any problem about this algorithm, I suggest you to contact: Matthias Mauch
* m.mauch@qmul.ac.uk who is the original C++ version author of this algorithm
*
* If you want to refer this code, please consider this article:
*
* M. Mauch and S. Dixon,
* “pYIN: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions”,
* in Proceedings of the IEEE International Conference on Acoustics,
* Speech, and Signal Processing (ICASSP 2014), 2014.
*
* M. Mauch, C. Cannam, R. Bittner, G. Fazekas, J. Salamon, J. Dai, J. Bello and S. Dixon,
* “Computer-aided Melody Note Transcription Using the Tony Software: Accuracy and Efficiency”,
* in Proceedings of the First International Conference on Technologies for
* Music Notation and Representation, 2015.
'''
import numpy as np
import copy
from math import *
from Yin import *
from YinUtil import RMS
from MonoPitch import MonoPitch
from MonoNote import MonoNote
class Feature(object):
def __init__(self):
self.values = np.array([], dtype=np.float64)
def resetValues(self):
self.values = np.array([], dtype=np.float64)
class FeatureSet(object):
def __init__(self):
self.m_oF0Candidates = []
self.m_oF0Probs = []
self.m_oVoicedProb = []
self.m_oCandidateSalience = []
self.m_oSmoothedPitchTrack = []
self.m_oMonoNoteOut = []
self.m_oNotes = []
self.m_oNotePitchTracks = []
class PyinMain(object):
def __init__(self):
self.m_channels = 0
self.m_stepSize = 256
self.m_blockSize = 2048
self.m_inputSampleRate = 44100
self.m_fmin = 40
self.m_fmax = 1600
self.m_yin = Yin()
self.m_threshDistr = 2.0
self.m_outputUnvoiced = 2
self.m_preciseTime = 0.0
self.m_lowAmp = 0.1
self.m_onsetSensitivity = 0.7
self.m_pruneThresh = 0.1
self.m_pitchProb = []
self.m_level = np.array([], dtype=np.float32)
self.fs = FeatureSet()
def initialise(self, channels = 1, inputSampleRate = 44100, stepSize = 256, blockSize = 2048,
lowAmp = 0.1, onsetSensitivity = 0.7, pruneThresh = 0.1 ):
if channels != 1:
return False
self.m_channels = channels
self.m_inputSampleRate = inputSampleRate
self.m_stepSize = stepSize
self.m_blockSize = blockSize
self.m_lowAmp = lowAmp
self.m_onsetSensitivity = onsetSensitivity
self.m_pruneThresh = pruneThresh
self.reset()
return True
def reset(self):
self.m_yin.setThresholdDistr(self.m_threshDistr)
self.m_yin.setFrameSize(self.m_blockSize)
self.m_yin.setFast(not self.m_preciseTime)
self.m_pitchProb = np.array([], dtype=np.float64)
self.m_level = np.array([], dtype=np.float32)
def process(self, inputBuffers):
dInputBuffers = np.zeros((self.m_blockSize,), dtype=np.float64)
for i in range(self.m_blockSize):
dInputBuffers[i] = inputBuffers[i]
rms = RMS(inputBuffers, self.m_blockSize)
isLowAmplitude = rms < self.m_lowAmp
yo = self.m_yin.processProbabilisticYin(dInputBuffers)
self.m_level = np.append(self.m_level, yo.rms)
'''
First, get the things out of the way that we don't want to output
immediately, but instead save for later
'''
tempPitchProb = np.array([], dtype=np.float32)
firstStack = False
for iCandidate in range(yo.freqProb.shape[0]):
tempPitch = 12.0 * log(yo.freqProb[iCandidate][0]/440.0)/log(2.0) + 69.0
if not isLowAmplitude:
if firstStack == False:
tempPitchProb = np.array([np.array([tempPitch, yo.freqProb[iCandidate][1]], dtype=np.float64),])
firstStack = True
else:
tempPitchProb = np.vstack((tempPitchProb, np.array([tempPitch, yo.freqProb[iCandidate][1]], dtype=np.float64)))
else:
factor = ((rms+0.01*self.m_lowAmp)/(1.01*self.m_lowAmp))
if firstStack == False:
tempPitchProb = np.array([np.array([tempPitch, yo.freqProb[iCandidate][1]*factor], dtype=np.float64),])
firstStack = True
else:
tempPitchProb = np.vstack((tempPitchProb, np.array([tempPitch, yo.freqProb[iCandidate][1]*factor], dtype=np.float64)))
if len(self.m_pitchProb) < 1 and len(tempPitchProb) > 0:
self.m_pitchProb = [tempPitchProb,]
elif len(self.m_pitchProb) >= 1:
self.m_pitchProb = self.m_pitchProb + [tempPitchProb]
# f0 CANDIDATES
f = Feature()
for i in range(yo.freqProb.shape[0]):
f.values = np.append(f.values, yo.freqProb[i][0])
self.fs.m_oF0Candidates.append(copy.copy(f))
f.resetValues()
voicedProb = 0.0
for i in range(yo.freqProb.shape[0]):
f.values = np.append(f.values, yo.freqProb[i][1])
voicedProb += yo.freqProb[i][1]
self.fs.m_oF0Probs.append(copy.copy(f))
f.values = np.append(f.values, voicedProb)
self.fs.m_oVoicedProb.append(copy.copy(f))
# SALIENCE -- maybe this should eventually disappear
f.resetValues()
salienceSum = 0.0
for iBin in range(yo.salience.shape[0]):
f.values = np.append(f.values, yo.salience[iBin])
salienceSum += yo.salience[iBin]
self.fs.m_oCandidateSalience.append(copy.copy(f))
return self.fs
def getSmoothedPitchTrack(self):
f = Feature()
if len(self.m_pitchProb) == 0:
return self.fs
# MONO-PITCH STUFF
mp = MonoPitch()
mpOut = mp.process(self.m_pitchProb)
for iFrame in range(len(mpOut)):
if mpOut[iFrame] < 0 and self.m_outputUnvoiced == 0:
continue
f.resetValues()
if self.m_outputUnvoiced == 1:
f.values = np.append(f.values, np.fabs(mpOut[iFrame]))
else:
f.values = np.append(f.values, mpOut[iFrame])
self.fs.m_oSmoothedPitchTrack.append(copy.copy(f))
return mpOut
def getRemainingFeatures(self,mpOut):
f = Feature()
if len(mpOut) == 0:
return self.fs
# if len(self.m_pitchProb) == 0:
# return self.fs
#
# # MONO-PITCH STUFF
# mp = MonoPitch()
# mpOut = mp.process(self.m_pitchProb)
# for iFrame in range(len(mpOut)):
# if mpOut[iFrame] < 0 and self.m_outputUnvoiced == 0:
# continue
# f.resetValues()
# if self.m_outputUnvoiced == 1:
# f.values = np.append(f.values, np.fabs(mpOut[iFrame]))
# else:
# f.values = np.append(f.values, mpOut[iFrame])
#
# self.fs.m_oSmoothedPitchTrack.append(copy.copy(f))
# MONO-NOTE STUFF
mn = MonoNote()
smoothedPitch = []
for iFrame in range(len(mpOut)):
temp = []
if mpOut[iFrame] > 0: # negative value: silence
tempPitch = 12 * log(mpOut[iFrame]/440.0)/log(2.0) + 69
temp += [[tempPitch, 0.9]]
smoothedPitch += [temp]
mnOut = mn.process(smoothedPitch)
self.fs.m_oMonoNoteOut = mnOut
# turning feature into a note feature
f.resetValues()
onsetFrame = 0
isVoiced = 0
oldIsVoiced = 0
nFrame = len(self.m_pitchProb)
minNoteFrames = (self.m_inputSampleRate*self.m_pruneThresh)/self.m_stepSize
notePitchTrack = np.array([], dtype=np.float32) # collects pitches for one note at a time
for iFrame in range(nFrame):
isVoiced = mnOut[iFrame].noteState < 3 \
and len(smoothedPitch[iFrame]) > 0 \
and (iFrame >= nFrame-2 or (self.m_level[iFrame]/self.m_level[iFrame+2]>self.m_onsetSensitivity))
if isVoiced and iFrame != nFrame-1:
if oldIsVoiced == 0: # beginning of the note
onsetFrame = iFrame
pitch = smoothedPitch[iFrame][0][0]
notePitchTrack = np.append(notePitchTrack, pitch) # add to the note's pitch
else: # not currently voiced
if oldIsVoiced == 1: # end of the note
if len(notePitchTrack) >= minNoteFrames:
notePitchTrack = np.sort(notePitchTrack)
medianPitch = notePitchTrack[int(len(notePitchTrack)/2)]
medianFreq = pow(2, (medianPitch-69)/12)*440
f.resetValues()
f.values = np.append(f.values, np.double(medianFreq))
self.fs.m_oNotes.append(copy.copy(f))
self.fs.m_oNotePitchTracks.append(copy.copy(notePitchTrack))
notePitchTrack = np.array([], dtype=np.float32)
oldIsVoiced = isVoiced
return self.fs
+31 -4
View File
@@ -396,7 +396,10 @@ class AudioFile(object):
grain_size = int(grain_size)
position = self.get_seek_position()
# Read grain
index = self.pysndfile_object.seek(start_index, 0)
try:
index = self.pysndfile_object.seek(start_index, 0)
except:
pdb.set_trace()
if index + grain_size > self.get_frames():
grain = self.read_frames(self.get_frames() - index)
if padding:
@@ -693,8 +696,8 @@ class AudioFile(object):
- overlap: the factor by which grains overlap (integer)
"""
length = self.samps_to_ms(self.frames)
hop_size = grain_length / overlap
length = self.frames
hop_size = int(np.floor(grain_length / overlap))
grain_count = int(length / hop_size) - 1
times = np.arange(grain_count).reshape(-1, 1)
times = np.hstack((times, times)).astype(np.dtype('float64'))
@@ -715,9 +718,33 @@ class AudioFile(object):
"AnalysedAudioFile.generate_grain_times(grain_size, "
"overlap, save_times=True)")
grain_times = self.times[key].copy()
grain_times *= (self.samplerate / 1000)
return self.read_grain(start_index=grain_times[0], grain_size=grain_times[1]-grain_times[0])
def read_extended_grain(self, key, extra_length=0):
"""
Allow for grains to be retreived by indexing with zero padding after grain times have been generated.
"""
if self.times == None:
raise IndexError("AudioFile object grain times must be generated "
"before grains can be accesed by index. Try running "
"AnalysedAudioFile.generate_grain_times(grain_size, "
"overlap, save_times=True)")
grain_times = self.times[key].copy()
start = int(grain_times[0]-extra_length)
zpad = 0
if start < 0:
zpad = abs(start)
start = 0
grain_size = (grain_times[1]-start)+extra_length
grain = self.read_grain(start_index=start, grain_size=grain_size, padding=True)
grain = np.pad(
grain,
(zpad, 0),
'constant',
constant_values=(0, 0)
)
return grain
@staticmethod
def gen_window(window_type, window_size, sym=True):
-1
View File
@@ -127,7 +127,6 @@ def parse_arguments():
"variance",
"kurtosis",
"skewness",
"harm_ratio"
]
parser.add_argument(
+25 -23
View File
@@ -2,13 +2,13 @@
rms = {
# Analysis window sizes can be changed for each analysis individually.
# These do not need to match the grain size of the matcher or synthesis.
"window_size": 100,
"window_size": 1024,
"overlap": 4,
}
f0 = {
"window_size": 4096,
"overlap": 4,
"window_size": 2048,
"overlap": 2,
# Currently all frames below this ratio are digaurded and left as silence.
# Different databases will require different values for the best results.
# Noisier databases will need lower values than more tonal databases.
@@ -17,26 +17,26 @@ f0 = {
# Specify analysis parameters for variance analysis.
variance = {
"window_size": 100,
"window_size": 1024,
"overlap": 4
}
# Specify analysis parameters for temporal kurtosis analysis.
kurtosis = {
"window_size": 100,
"window_size": 1024,
"overlap": 4
}
# Specify analysis parameters for temporal skewness analysis.
skewness = {
"window_size": 100,
"window_size": 1024,
"overlap": 4
}
# Specify analysis parameters for FFT analysis.
fft = {
# The FFT window size determines the window size for all spectral analyses.
"window_size": 4096
"window_size": 2048
}
database = {
@@ -48,20 +48,20 @@ database = {
# Sets the weighting for each analysis. a higher weighting gives an analysis
# higher presendence when finding the best matches.
matcher_weightings = {
"f0" : 8,
"f0" : 1,
"spccntr" : 1.,
"spcsprd" : 1.,
"spcflux" : 3.,
"spccf" : 3.,
"spcflatness": 3.,
"spcflatness": 1.,
"zerox" : 1.,
"rms" : 0.1,
"peak": 0.1,
"centroid": 0.5,
"kurtosis": 2.,
"skewness": 2.,
"rms" : 1,
"peak": 1,
"centroid": 0.0,
"kurtosis": 0.,
"skewness": 0.,
"variance": 0.,
"harm_ratio": 2
"harm_ratio": 0
}
# Specifies the method for averaging analysis frames to create a single value
@@ -99,11 +99,11 @@ matcher = {
# This value must be the same as the synthesis grain size to avoid the
# speeding up or slowing down of the resulting file in relation to the
# original.
"grain_size": 100,
"overlap": 4,
"grain_size": 2048,
"overlap": 2,
# Defines the number of matches to keep for synthesis. Note that this must
# also be specified in the synthesis config
"match_quantity": 2,
"match_quantity": 10,
# Choose the algorithm used to perform matching. kdtree is recommended for
# larger datasets.
"method": 'kdtree'
@@ -114,20 +114,22 @@ synthesizer = {
# between source and target.
"enforce_intensity": True,
# Specify the ratio limit that is the grain can be scaled by.
"enf_intensity_ratio_limit": 1000.,
"enf_intensity_ratio_limit": 20.,
# Artificially modify the pitch by the difference in f0 values between
# source and target.
"enforce_f0": True,
# Specify the ratio limit that is the grain can be modified by.
"enf_f0_ratio_limit": 1.,
"grain_size": 100,
"overlap": 4,
"enf_f0_ratio_limit": 10.,
"grain_size": 2048,
"overlap": 2,
# Normalize output, avoid clipping of final output by scaling the final
# frames.
"normalize" : False,
# Defines the number of potential grains to choose from matches when
# synthesizing output.
"match_quantity": 2
"match_quantity": 1,
"silence_inharmonics": True
}
# Specifies the format for the output file. Changing this has not been tested
Executable → Regular
View File
+154 -63
View File
@@ -82,7 +82,6 @@ class AudioDatabase:
'variance',
'kurtosis',
'skewness',
'harm_ratio'
}
for analysis in analysis_list:
if analysis not in valid_analyses:
@@ -362,7 +361,7 @@ class Matcher:
grain_indexes = np.empty((entry_count, 2))
for ind, entry in enumerate(database.analysed_audio):
length = entry.samps_to_ms(entry.frames)
length = entry.frames
hop_size = grain_length / overlap
grain_indexes[ind][0] = int(length / hop_size) - 1
grain_indexes[:, 1] = np.cumsum(grain_indexes[:, 0]).astype(int)
@@ -415,7 +414,7 @@ class Matcher:
# Create an array of grain times for target sample
target_times = target_entry.times
x_size = target_times.shape[0]
match_indexes = np.empty((x_size, self.match_quantity))
match_indexes = np.empty((x_size, self.match_quantity), dtype=int)
match_vals = np.empty((x_size, self.match_quantity))
match_vals.fill(np.inf)
@@ -453,7 +452,6 @@ class Matcher:
source_data *= weightings[analysis]
all_source_analyses[i] = source_data
# Impute values for Nans
nan_columns = np.all(np.isnan(all_source_analyses), axis=0)
all_source_analyses[:, nan_columns] = 0.
@@ -470,7 +468,8 @@ class Matcher:
vals_append = np.append(match_vals, results_vals, axis=1)
vals_sort = np.argsort(vals_append)
inds_append = np.append(match_indexes, results_inds+source_sample_indexes[sind][0], axis=1)
#TODO: Check that this minus 1 should be there...
inds_append = np.append(match_indexes, results_inds+int(source_sample_indexes[sind][0]), axis=1)
m = np.arange(len(vals_append))[:, np.newaxis]
best_match_inds = inds_append[m, vals_sort]
@@ -480,6 +479,90 @@ class Matcher:
match_grain_inds = self.calculate_db_inds(match_indexes, source_sample_indexes)
###################################################################
# Find optimum continuity between selection of best matches per
# grain
###################################################################
# Initialize empty array to store final match indexes
final_indexes = np.array([])
# For every match found
for h, match_indexes in enumerate(match_grain_inds[:-1]):
self.logger.info("Calculating grain distances for grain: {0} of {1}".format(h, match_grain_inds.shape[0]))
# Initialise empty array to store analysis values for all of
# the current grain's matches
all_source_analyses = np.zeros((len(self.matcher_analyses), self.match_quantity))
# For every analysis generated...
for i, analysis in enumerate(self.matcher_analyses):
# Find the style of formatting used for reducing analysis
# frames to a single normalized value
analysis_formatting = self.analysis_dict[analysis]
# For each of the current grain's matches...
for j, (db_ind, grain_ind) in enumerate(match_indexes):
# Get the start and end time (in samples) for the
# current grain
grain_time = self.source_db.analysed_audio[db_ind].times[grain_ind-1]
# Get the normalised analysis value for the current
# grain
analysis_val, s = self.source_db.analysed_audio[db_ind].analysis_data_grains(grain_time, analysis, format=analysis_formatting)
# Weight the analysis by weighting set by user
analysis_val *= weightings[analysis]
# Add grain's analysis to
all_source_analyses[i][j] = analysis_val
# get the next grain's match indexes...
next_grain_indexes = match_grain_inds[h+1]
# Initialise empty array to store analysis values for all of
# the next grain's matches
next_grain_analyses = np.zeros((len(self.matcher_analyses), self.match_quantity))
# For every analysis generated...
for i, analysis in enumerate(self.matcher_analyses):
# Find the style of formatting used for reducing analysis
# frames to a single normalized value
analysis_formatting = self.analysis_dict[analysis]
# For each of the next grain's matches...
for j, (db_ind, grain_ind) in enumerate(next_grain_indexes):
# Get the start and end time (in samples) for the
# next grain
grain_time = self.source_db.analysed_audio[db_ind].times[grain_ind-1]
analysis_val, s = self.source_db.analysed_audio[db_ind].analysis_data_grains(grain_time, analysis, format=analysis_formatting)
analysis_val *= weightings[analysis]
next_grain_analyses[i][j] = analysis_val
# Impute values for Nans
nan_columns = np.all(np.isnan(all_source_analyses), axis=0)
all_source_analyses[:, nan_columns] = 0.
all_source_analyses = imp.fit_transform(all_source_analyses)
# Impute values for Nans
nan_columns = np.all(np.isnan(next_grain_analyses), axis=0)
next_grain_analyses[:, nan_columns] = 0.
next_grain_analyses = imp.fit_transform(next_grain_analyses)
source_tree = spatial.cKDTree(all_source_analyses.T, leafsize=100)
# Return array of distances and indexes for matches in the next
# grain that are closest to matches in the current grains.
results_vals, results_inds = source_tree.query(next_grain_analyses.T, k=1, p=2)
# Make sure arrays are in the correct orientation
if len(results_vals.shape) < 2:
results_vals = np.array([results_vals]).T
results_inds = np.array([results_inds]).T
# If the distance was calculated between the first and second
# grains, add the first index as well as the second
if not final_indexes.size:
a = np.argmax(results_vals)
final_indexes = np.vstack((match_indexes[a], match_indexes[results_inds[a]]))
else:
a = np.argmax(results_vals)
final_indexes = np.append(final_indexes, match_indexes[results_inds[a]], axis=0)
match_grain_inds = final_indexes
###################################################################
datafile_path = ''.join(("match/", target_entry.name))
try:
self.output_db.data[datafile_path] = match_grain_inds
@@ -738,7 +821,7 @@ class Matcher:
if not np.all(np.any(x, axis=1)):
raise ValueError("Not all match indexes have a corresponding sample index. This shouldn't happen...\n"
"Check that all database path arguments are correct then try re-running with the --rematch and --reanalyse flags.\n"
"If this does'nt work, delete the audio and data directories in all databases and try again...")
"If this doesn't work, delete the audio and data directories in all databases and try again...")
x = x.reshape(mi_shape[0], mi_shape[1], x.shape[1])
x = np.argmax(x, axis=2)
@@ -819,63 +902,70 @@ class Synthesizer:
format=output_config["format"],
channels=output_config["channels"]
) as output:
hop_size = (grain_size / overlap) * output.samplerate/1000
_grain_size *= int(output.samplerate / 1000)
hop_size = int(np.floor(grain_size / overlap))
output_frames = np.zeros(_grain_size*2 + (int(hop_size*len(grain_matches))))
offset = 0
for target_grain_ind, matches in enumerate(grain_matches):
# If there are multiple matches, choose a match at random
# from available matches.
match_index = np.random.randint(matches.shape[0])
match_db_ind, match_grain_ind = matches[match_index]
with self.match_db.analysed_audio[match_db_ind] as match_sample:
self.logger.info("Synthesizing grain:\n"
"Source sample: {0}\n"
"Source grain index: {1}\n"
"Target output: {2}\n"
"Target grain index: {3} out of {4}".format(
match_sample,
match_grain_ind,
output_name,
target_grain_ind,
len(grain_matches)
))
match_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
#match_index = np.random.randint(matches.shape[0])
match_db_ind, match_grain_ind = matches
try:
with self.match_db.analysed_audio[match_db_ind] as match_sample:
self.logger.info("Synthesizing grain:\n"
"Source sample: {0}\n"
"Source grain index: {1}\n"
"Target output: {2}\n"
"Target grain index: {3} out of {4}".format(
match_sample,
match_grain_ind,
output_name,
target_grain_ind,
len(grain_matches)
))
match_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
# TODO: Make proper fix for grain index offset of 1
try:
match_grain = match_sample[match_grain_ind-1]
except:
pdb.set_trace()
# TODO: Make proper fix for grain index offset of 1
# match_grain = match_sample[match_grain_ind-1]
extra_length = match_sample.get_samplerate()
match_grain = match_sample.read_extended_grain(match_grain_ind-1, extra_length=extra_length)
###################################################################
# Adjust current grain overlap to correlate with previous grain
###################################################################
if self.enforce_intensity_bool:
# Get the target sample from the database
target_sample = self.target_db[job_ind]
if self.enforce_intensity_bool:
# Get the target sample from the database
target_sample = self.target_db[job_ind]
# Calculate garin times for sample to allow for
# indexing.
target_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
# Calculate garin times for sample to allow for
# indexing.
target_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
match_grain = self.enforce_intensity(match_grain, match_sample, match_grain_ind, target_sample, target_grain_ind)
match_grain = self.enforce_intensity(match_grain, match_sample, match_grain_ind, target_sample, target_grain_ind)
if self.enforce_f0_bool:
# Get the target sample from the database
target_sample = self.target_db[job_ind]
if self.enforce_f0_bool:
# Get the target sample from the database
target_sample = self.target_db[job_ind]
# Calculate grain times for sample to allow for
# indexing.
target_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
# Calculate grain times for sample to allow for
# indexing.
target_sample.generate_grain_times(match_grain_size, match_overlap, save_times=True)
match_grain = self.enforce_pitch(match_grain, match_sample, match_grain_ind, target_sample, target_grain_ind)
# TODO: Fix occasional output of grain size one
# sample larger than it should be
match_grain = self.enforce_pitch(match_grain, match_sample, match_grain_ind, target_sample, target_grain_ind)
match_grain = match_grain[extra_length:-extra_length]
# Apply hanning window to grain
match_grain *= np.hanning(match_grain.size)
try:
output_frames[offset:offset+match_grain.size] += match_grain
except:
pass
offset += hop_size
# Apply hanning window to grain
match_grain *= np.hanning(match_grain.size)
try:
output_frames[offset:offset+match_grain.size] += match_grain
except:
pass
offset += hop_size
except:
pdb.set_trace()
# If output normalization is active, normalize output.
if self.config.synthesizer["normalize"]:
output_frames = (output_frames / np.max(np.abs(output_frames))) * 0.9
@@ -892,10 +982,6 @@ class Synthesizer:
# TODO: Make proper fix for grain index offset of 1
target_times = target_sample.times[target_grain_ind-1]
# Get mean harmonic ratio of f0 frames in time range specified.
target_harmonic_ratio = target_sample.analysis_data_grains(target_times, "harm_ratio", format="mean")[0][0]
# Get mean of f0 frames in time range specified.
target_f0 = target_sample.analysis_data_grains(target_times, "f0", format="median")[0][0]
@@ -903,20 +989,25 @@ class Synthesizer:
# TODO: Make proper fix for grain index offset of 1
source_times = source_sample.times[source_grain_ind-1]
# Get mean harmonic ratio of f0 frames in time range specified.
source_harmonic_ratio = source_sample.analysis_data_grains(source_times, "harm_ratio", format="mean")[0][0]
hr_array = np.array([source_harmonic_ratio, target_harmonic_ratio])
if np.any(np.isnan(hr_array)):
return grain*0
# Get mean of f0 frames in time range specified.
source_f0 = source_sample.analysis_data_grains(source_times, "f0", format="median")[0][0]
f0_array = np.array([source_f0, target_f0])
if np.any(np.isnan(f0_array)):
if not self.config.synthesizer["silence_inharmonics"]:
return grain
else:
return grain*0
ratio_difference = target_f0 / source_f0
if not np.isfinite(ratio_difference):
return grain*0
if not self.config.synthesizer["silence_inharmonics"]:
return grain
else:
return grain*0
# If the ratio difference is within the limits
ratio_limit = self.config.synthesizer["enf_f0_ratio_limit"]
@@ -935,8 +1026,10 @@ class Synthesizer:
ratio_difference = 1./ratio_limit
grain = pitch_shift.shift(grain, ratio_difference)
'''
if ratio_difference > ratio_limit or ratio_difference < 1./ratio_limit:
grain *= 0
'''
return grain
@@ -953,9 +1046,8 @@ class Synthesizer:
# Get mean of RMS frames in time range specified.
target_rms = target_sample.analysis_data_grains(target_times, "rms", format="mean")[0][0]
target_peak = target_sample.analysis_data_grains(target_times, "peak", format="mean")[0][0]
target_intensity_value = np.mean([target_rms, target_peak])
target_intensity_value = target_rms
# Get grain start and finish range to retreive analysis frames from.
# TODO: Make proper fix for grain index offset of 1
@@ -963,9 +1055,8 @@ class Synthesizer:
# Get mean of RMS frames in time range specified.
source_rms = source_sample.analysis_data_grains(source_times, "rms", format="mean")[0][0]
source_peak = source_sample.analysis_data_grains(source_times, "peak", format="mean")[0][0]
source_intensity_value = np.mean([source_rms, source_peak])
source_intensity_value = np.mean(source_rms)
ratio_difference = target_intensity_value / source_intensity_value
Executable → Regular
View File
Executable → Regular
View File
+38 -38
View File
@@ -142,7 +142,7 @@ def main():
example below attempts to mimic the examples provided by mathworks MATLAB
documentation, http://www.mathworks.com/help/toolbox/signal/
"""
import pylab
import matplotlib.pyplot as plt
argv = sys.argv
if len(argv) != 1:
print >>sys.stderr, 'usage: python -m pim.sp.multirate'
@@ -166,25 +166,25 @@ def main():
t = numpy.arange(0, 1, 0.00025)
x = numpy.sin(2*numpy.pi*30*t) + numpy.sin(2*numpy.pi*60*t)
y = decimate(x,4)
pylab.figure()
pylab.subplot(2, 1, 1)
pylab.title('Original Signal')
pylab.stem(numpy.arange(len(x[0:120])), x[0:120])
pylab.subplot(2, 1, 2)
pylab.title('Decimated Signal')
pylab.stem(numpy.arange(len(y[0:30])), y[0:30])
plt.figure()
plt.subplot(2, 1, 1)
plt.title('Original Signal')
plt.stem(numpy.arange(len(x[0:120])), x[0:120])
plt.subplot(2, 1, 2)
plt.title('Decimated Signal')
plt.stem(numpy.arange(len(y[0:30])), y[0:30])
#Interp
t = numpy.arange(0, 1, 0.001)
x = numpy.sin(2*numpy.pi*30*t) + numpy.sin(2*numpy.pi*60*t)
y = interp(x,4)
pylab.figure()
pylab.subplot(2, 1, 1)
pylab.title('Original Signal')
pylab.stem(numpy.arange(len(x[0:30])), x[0:30])
pylab.subplot(2, 1, 2)
pylab.title('Interpolated Signal')
pylab.stem(numpy.arange(len(y[0:120])), y[0:120])
plt.figure()
plt.subplot(2, 1, 1)
plt.title('Original Signal')
plt.stem(numpy.arange(len(x[0:30])), x[0:30])
plt.subplot(2, 1, 2)
plt.title('Interpolated Signal')
plt.stem(numpy.arange(len(y[0:120])), y[0:120])
#upfirdn
L = 147.0
@@ -196,11 +196,11 @@ def main():
n = numpy.arange(0, 10239)
x = numpy.sin(2*numpy.pi*1000/Fs*n)
y = upfirdn(x, h, L, M)
pylab.figure()
pylab.stem(n[1:49]/Fs, x[1:49])
pylab.stem(n[1:45]/(Fs*L/M), y[13:57], 'r', markerfmt='ro',)
pylab.xlabel('Time (sec)')
pylab.ylabel('Signal value')
plt.figure()
plt.stem(n[1:49]/Fs, x[1:49])
plt.stem(n[1:45]/(Fs*L/M), y[13:57], 'r', markerfmt='ro',)
plt.xlabel('Time (sec)')
plt.ylabel('Signal value')
#resample
fs1 = 10.0
@@ -208,30 +208,30 @@ def main():
x = t1
y = resample(x, 3, 2)
t2 = numpy.arange(0,(len(y)))*2.0/(3.0*fs1)
pylab.figure()
pylab.plot(t1, x, '*')
pylab.plot(t2, y, 'o')
pylab.plot(numpy.arange(-0.5,1.5, 0.01), numpy.arange(-0.5,1.5, 0.01), ':')
pylab.legend(('original','resampled'))
pylab.xlabel('Time')
plt.figure()
plt.plot(t1, x, '*')
plt.plot(t2, y, 'o')
plt.plot(numpy.arange(-0.5,1.5, 0.01), numpy.arange(-0.5,1.5, 0.01), ':')
plt.legend(('original','resampled'))
plt.xlabel('Time')
x = numpy.hstack([numpy.arange(1,11), numpy.arange(9,0,-1)])
y = resample(x,3,2)
pylab.figure()
pylab.subplot(2, 1, 1)
pylab.title('Edge Effects Not Noticeable')
pylab.plot(numpy.arange(19)+1, x, '*')
pylab.plot(numpy.arange(29)*2/3.0 + 1, y, 'o')
pylab.legend(('original', 'resampled'))
plt.figure()
plt.subplot(2, 1, 1)
plt.title('Edge Effects Not Noticeable')
plt.plot(numpy.arange(19)+1, x, '*')
plt.plot(numpy.arange(29)*2/3.0 + 1, y, 'o')
plt.legend(('original', 'resampled'))
x = numpy.hstack([numpy.arange(10, 0, -1), numpy.arange(2,11)])
y = resample(x,3,2)
pylab.subplot(2, 1, 2)
pylab.plot(numpy.arange(19)+1, x, '*')
pylab.plot(numpy.arange(29)*2/3.0 + 1, y, 'o')
pylab.title('Edge Effects Very Noticeable')
pylab.legend(('original', 'resampled'))
plt.subplot(2, 1, 2)
plt.plot(numpy.arange(19)+1, x, '*')
plt.plot(numpy.arange(29)*2/3.0 + 1, y, 'o')
plt.title('Edge Effects Very Noticeable')
plt.legend(('original', 'resampled'))
pylab.show()
plt.show()
return 0
if __name__ == '__main__':
+3
View File
@@ -31,3 +31,6 @@ def shift(sigin, pitch):
# Read result
result = shift_output.read_grain()
return result
def shift2():
print "To implement..."
Executable → Regular
View File
+9 -9
View File
@@ -1,11 +1,11 @@
import numpy as np
def gen_wave(
size,
freq,
wave_type,
phase = 0.0,
amplitude = 1.0,
size,
freq,
wave_type,
phase = 0.0,
amplitude = 1.0,
samplerate = 44100
):
"""
@@ -18,10 +18,10 @@ def gen_wave(
def sine():
samples = np.arange(0, size, 1. / samplerate)
return amplitude * np.sin(2.0*np.pi*freq*samples)
def square():
return amplitude * np.sign(sine())
def triangle():
samples = np.arange(0, size, 1. / samplerate)
return amplitude - (2 * np.abs(samples * (2 * freq) % (2*amplitude) - amplitude))
@@ -29,11 +29,11 @@ def gen_wave(
def sawtooth():
samples = np.arange(0, size, 1. / samplerate)
return amplitude - (2 * np.abs((samples * freq) % amplitude - amplitude))
def reverse_saw():
samples = np.arange(0, size, 1. / samplerate)
return amplitude - (2 * np.abs(((samples * freq) % amplitude)))
options = {
"sine" : sine,
"square" : square,
View File
+10
View File
@@ -1,8 +1,18 @@
"""A set of unit tests to check the correct operation of the pysound module."""
import unittest
import numpy as np
import sys
######
# Import sppysound module from directory above current one
from inspect import getsourcefile
import os.path as path, sys
current_dir = path.dirname(path.abspath(getsourcefile(lambda:0)))
sys.path.insert(0, current_dir[:current_dir.rfind(path.sep)])
from sppysound import AudioFile, analysis
from sppysound.database import AudioDatabase, Matcher
######
import subprocess
from scipy import signal
Executable → Regular
View File
Executable → Regular
View File