"""Cover song similarity with Essentia (standard + streaming modes).

Adapted from the tutorial at:
https://mtg.github.io/essentia-labs/news/2019/09/05/cover-song-similarity/

The script works in two stages:

1. Standard mode: load the reference song and pre-compute its HPCP
   (Harmonic Pitch Class Profile) chroma features.
2. Streaming mode: build a network that extracts HPCP features from the
   query song, cross-compares them against the reference features and
   aligns the result with ``CoverSongSimilarity``; the score matrix and
   the distance values are collected in a ``Pool``.

The last collected distance value is printed at the end.
"""

import essentia.standard as estd
import essentia.streaming as estr
from essentia import array, run, Pool  # `array` is unused here; kept as in the original
from essentia.pytools.spectral import hpcpgram

# Sample rate shared by every loader and analysis algorithm below.
# The reference and query features must be computed at the same rate,
# so it is defined once instead of being repeated in each call.
SAMPLE_RATE = 32000

#################
# standard part #
#################

yesterday_original = 'audio/Yesterday (Remastered 2009).mp3'
yesterday_cover_01 = 'audio/Yesterday - The Beatles - Connie Talbot (Cover).mp3'
wrong_song = 'audio/Bella Poarch - Build a Btch (Official Music Video).mp3'

# Reference song: the track the query is compared against.
song_reference = yesterday_original

# Load the reference song as a mono signal.
original_song = estd.MonoLoader(filename=song_reference,
                                sampleRate=SAMPLE_RATE)()

# Now let's compute the Harmonic Pitch Class Profile (HPCP) chroma
# features of the reference audio signal.
# NOTE: despite its name, `true_cover_hpcp` holds the features of
# `song_reference` (here, the original recording).
true_cover_hpcp = hpcpgram(original_song, sampleRate=SAMPLE_RATE)

##################
# Streaming part #
##################

# Query song: the track whose similarity to the reference is measured.
query_filename = wrong_song

# Let's instantiate all the required essentia streaming algorithms.
audio = estr.MonoLoader(filename=query_filename, sampleRate=SAMPLE_RATE)
frame_cutter = estr.FrameCutter(frameSize=4096, hopSize=2048)
windowing = estr.Windowing(type="blackmanharris62")
spectrum = estr.Spectrum()
peak = estr.SpectralPeaks(sampleRate=SAMPLE_RATE)
whitening = estr.SpectralWhitening(maxFrequency=3500,
                                   sampleRate=SAMPLE_RATE)
hpcp = estr.HPCP(sampleRate=SAMPLE_RATE,
                 minFrequency=100,
                 maxFrequency=3500,
                 size=12)

# Create an instance of the streaming ChromaCrossSimilarity algorithm.
# With parameter `referenceFeature`, we can pass the pre-computed
# reference song chroma features. In this case, we use the HPCP feature
# computed above in standard mode.
# With parameter `oti`, we can transpose the pitch of the reference song
# HPCP feature to a given OTI [5] (if it is known beforehand).
# By default we set `oti=0`.
sim_matrix = estr.ChromaCrossSimilarity(referenceFeature=true_cover_hpcp,
                                        oti=0)

# Create an instance of the cover song similarity alignment algorithm.
# 'pipeDistance=True' writes the distance values to stdout for each
# input stream.
alignment = estr.CoverSongSimilarity(pipeDistance=True)

# essentia Pool instance (python dict like object) to aggregate the outputs.
pool = Pool()

# Connect all the required algorithms in an essentia streaming network,
# i.e. connect the inputs and outputs of the algorithms in the required
# workflow and order.
audio.audio >> frame_cutter.signal
frame_cutter.frame >> windowing.frame
windowing.frame >> spectrum.frame
spectrum.spectrum >> peak.spectrum
spectrum.spectrum >> whitening.spectrum
peak.magnitudes >> whitening.magnitudes
peak.frequencies >> whitening.frequencies
peak.frequencies >> hpcp.frequencies
whitening.magnitudes >> hpcp.magnitudes
hpcp.hpcp >> sim_matrix.queryFeature
sim_matrix.csm >> alignment.inputArray
alignment.scoreMatrix >> (pool, 'scoreMatrix')
alignment.distance >> (pool, 'distance')

# Run the algorithm network.
run(audio)

# This process writes the cover song similarity distance to stdout for
# every input stream in realtime. It also aggregates the Smith-Waterman
# alignment score matrix and the cover song similarity distance for
# every accumulating input audio stream in an essentia pool instance
# (similar to a python dict), which can be accessed after the end of
# the stream.

# Now, let's check the final cover song similarity distance value
# computed at the last input stream.
print(pool['distance'][-1])