1414import multiprocessing
1515
1616logger = logging .getLogger (__name__ )
17-
17+ stream_latency = - 1
1818
1919class Harvest (multiprocessing .Process ):
2020 def __init__ (self , inp_q , opt_q ):
@@ -100,7 +100,8 @@ class GUI:
100100 def __init__ (self ) -> None :
101101 self .config = GUIConfig ()
102102 self .flag_vc = False
103-
103+ self .function = 'vc'
104+ self .delay_time = 0
104105 self .launcher ()
105106
106107 def load (self ):
@@ -112,6 +113,10 @@ def load(self):
112113 data ["harvest" ] = data ["f0method" ] == "harvest"
113114 data ["crepe" ] = data ["f0method" ] == "crepe"
114115 data ["rmvpe" ] = data ["f0method" ] == "rmvpe"
116+ if data ["sg_input_device" ] not in input_devices :
117+ data ["sg_input_device" ] = input_devices [sd .default .device [0 ]]
118+ if data ["sg_output_device" ] not in output_devices :
119+ data ["sg_output_device" ] = output_devices [sd .default .device [1 ]]
115120 except :
116121 with open ("configs/config.json" , "w" ) as j :
117122 data = {
@@ -342,6 +347,22 @@ def launcher(self):
342347 [
343348 sg .Button (i18n ("开始音频转换" ), key = "start_vc" ),
344349 sg .Button (i18n ("停止音频转换" ), key = "stop_vc" ),
350+ sg .Radio (
351+ i18n ("输入监听" ),
352+ "function" ,
353+ key = "im" ,
354+ default = False ,
355+ enable_events = True ,
356+ ),
357+ sg .Radio (
358+ i18n ("输出变声" ),
359+ "function" ,
360+ key = "vc" ,
361+ default = True ,
362+ enable_events = True ,
363+ ),
364+ sg .Text (i18n ("算法延迟(ms):" )),
365+ sg .Text ("0" , key = "delay_time" ),
345366 sg .Text (i18n ("推理时间(ms):" )),
346367 sg .Text ("0" , key = "infer_time" ),
347368 ],
@@ -403,9 +424,16 @@ def event_handler(self):
403424 }
404425 with open ("configs/config.json" , "w" ) as j :
405426 json .dump (settings , j )
427+ global stream_latency
428+ while stream_latency < 0 :
429+ time .sleep (0.01 )
430+ self .delay_time = stream_latency + values ["block_time" ] + values ["crossfade_length" ] + 0.01
431+ if values ["I_noise_reduce" ]:
432+ self .delay_time += values ["crossfade_length" ]
433+ self .window ["delay_time" ].update (int (self .delay_time * 1000 ))
406434 if event == "stop_vc" and self .flag_vc == True :
407435 self .flag_vc = False
408-
436+ stream_latency = - 1
409437 # Parameter hot update
410438 if event == "threhold" :
411439 self .config .threhold = values ["threhold" ]
@@ -423,11 +451,17 @@ def event_handler(self):
423451 self .config .f0method = event
424452 elif event == "I_noise_reduce" :
425453 self .config .I_noise_reduce = values ["I_noise_reduce" ]
454+ if stream_latency > 0 :
455+ self .delay_time += (1 if values ["I_noise_reduce" ] else - 1 ) * values ["crossfade_length" ]
456+ self .window ["delay_time" ].update (int (self .delay_time * 1000 ))
426457 elif event == "O_noise_reduce" :
427458 self .config .O_noise_reduce = values ["O_noise_reduce" ]
459+ elif event in ["vc" , "im" ]:
460+ self .function = event
428461 elif event != "start_vc" and self .flag_vc == True :
429462 # Other parameters do not support hot update
430463 self .flag_vc = False
464+ stream_latency = - 1
431465
432466 def set_values (self , values ):
433467 if len (values ["pth_path" ].strip ()) == 0 :
@@ -565,7 +599,9 @@ def soundinput(self):
565599 blocksize = self .block_frame ,
566600 samplerate = self .config .samplerate ,
567601 dtype = "float32" ,
568- ):
602+ ) as stream :
603+ global stream_latency
604+ stream_latency = stream .latency [- 1 ]
569605 while self .flag_vc :
570606 time .sleep (self .config .block_time )
571607 logger .debug ("Audio block passed." )
@@ -597,7 +633,7 @@ def audio_callback(
597633 self .block_frame_16k :
598634 ].clone ()
599635 # input noise reduction and resampling
600- if self .config .I_noise_reduce :
636+ if self .config .I_noise_reduce and self . function == 'vc' :
601637 input_wav = self .input_wav [
602638 - self .crossfade_frame - self .block_frame - 2 * self .zc :
603639 ]
@@ -621,23 +657,28 @@ def audio_callback(
621657 self .input_wav [- self .block_frame - 2 * self .zc :]
622658 )[160 :]
623659 # infer
624- f0_extractor_frame = self .block_frame_16k + 800
625- if self .config .f0method == "rmvpe" :
626- f0_extractor_frame = 5120 * ((f0_extractor_frame - 1 ) // 5120 + 1 ) - 160
627- infer_wav = self .rvc .infer (
628- self .input_wav_res ,
629- self .input_wav_res [- f0_extractor_frame :].cpu ().numpy (),
630- self .block_frame_16k ,
631- self .valid_rate ,
632- self .pitch ,
633- self .pitchf ,
634- self .config .f0method ,
635- )
636- infer_wav = infer_wav [
637- - self .crossfade_frame - self .sola_search_frame - self .block_frame :
638- ]
660+ if self .function == 'vc' :
661+ f0_extractor_frame = self .block_frame_16k + 800
662+ if self .config .f0method == "rmvpe" :
663+ f0_extractor_frame = 5120 * ((f0_extractor_frame - 1 ) // 5120 + 1 ) - 160
664+ infer_wav = self .rvc .infer (
665+ self .input_wav_res ,
666+ self .input_wav_res [- f0_extractor_frame :].cpu ().numpy (),
667+ self .block_frame_16k ,
668+ self .valid_rate ,
669+ self .pitch ,
670+ self .pitchf ,
671+ self .config .f0method ,
672+ )
673+ infer_wav = infer_wav [
674+ - self .crossfade_frame - self .sola_search_frame - self .block_frame :
675+ ]
676+ else :
677+ infer_wav = self .input_wav [
678+ - self .crossfade_frame - self .sola_search_frame - self .block_frame :
679+ ].clone ()
639680 # output noise reduction
640- if self .config .O_noise_reduce :
681+ if ( self .config .O_noise_reduce and self . function == 'vc' ) or ( self . config . I_noise_reduce and self . function == 'im' ) :
641682 self .output_buffer [: - self .block_frame ] = self .output_buffer [
642683 self .block_frame :
643684 ].clone ()
@@ -646,7 +687,7 @@ def audio_callback(
646687 infer_wav .unsqueeze (0 ), self .output_buffer .unsqueeze (0 )
647688 ).squeeze (0 )
648689 # volume envelope mixing
649- if self .config .rms_mix_rate < 1 :
690+ if self .config .rms_mix_rate < 1 and self . function == 'vc' :
650691 rms1 = librosa .feature .rms (
651692 y = self .input_wav_res [- 160 * infer_wav .shape [0 ] // self .zc :]
652693 .cpu ()
0 commit comments