PyOpenJTalkのコードを見てみる2
コマンドツール版にあってPyOpenJTalk引数にないもの一覧。それらを実装するにはどうしたらいいか。
成果物
情報源
コマンドツール版にあってPyOpenJTalk引数にないもの一覧
open_jtalk
usage: open_jtalk [ options ] [ infile ] options: [ def][ min-- max] -x dir : dictionary directory [ N/A] -m htsvoice : HTS voice files [ N/A] -ow s : filename of output wav audio (generated speech) [ N/A] -ot s : filename of output trace information [ N/A] -s i : sampling frequency [ auto][ 1-- ] -p i : frame period (point) [ auto][ 1-- ] -a f : all-pass constant [ auto][ 0.0-- 1.0] -b f : postfiltering coefficient [ 0.0][ 0.0-- 1.0] -r f : speech speed rate [ 1.0][ 0.0-- ] -fm f : additional half-tone [ 0.0][ -- ] -u f : voiced/unvoiced threshold [ 0.5][ 0.0-- 1.0] -jm f : weight of GV for spectrum [ 1.0][ 0.0-- ] -jf f : weight of GV for log F0 [ 1.0][ 0.0-- ] -g f : volume (dB) [ 0.0][ -- ] -z i : audio buffer size (if i==0, turn off) [ 0][ 0-- ] infile: text file [stdin]
options | comment | 既存 | init | range | API |
---|---|---|---|---|---|
-x dir |
dictionary directory |
○ | - | - | OpenJTalk(dn_mecab=dn_mecab.encode('ascii')) |
-m htsvoice |
HTS voice files |
○ | - | - | HTSEngine(htsvoice.encode('ascii')) |
-ow s |
filename of output wav audio (generated speech) |
☓ | - | - | HTS_Engine_save_generated_speech |
-ot s |
filename of output trace information |
☓ | - | - | HTS_Engine_save_information HTS_Engine_save_label HTS_Engine_save_generated_parameter HTS_Engine_save_riff |
-s i |
sampling frequency |
☓ | auto |
1 〜 |
HTS_Engine_set_sampling_frequency |
-p i |
frame period (point) |
☓ | auto |
1 〜 |
HTS_Engine_set_fperiod |
-a f |
all-pass constant |
? | auto |
0.0 〜1.0 |
対応API不明 |
-b f |
postfiltering coefficient |
? | 0.0 |
0.0 〜1.0 |
対応API不明 |
-r f |
speech speed rate |
○ | 1.0 |
0.0 〜 |
pyopenjtalk. tts(speed=1.0) |
-fm f |
additional half-tone |
○ | 0.0 |
- | pyopenjtalk. tts(half_tone=0.0) |
-u f |
voiced/unvoiced threshold |
☓ | 0.5 |
0.0 〜1.0 |
HTS_Engine_set_msd_threshold |
-jm f |
weight of GV for spectrum |
☓ | 1.0 |
0.0 〜 |
HTS_Engine_set_gv_weight |
-jf f |
weight of GV for log F0 |
☓ | 1.0 |
0.0 〜 |
HTS_Engine_set_gv_interpolation_weight |
-g f |
volume (dB) |
☓ | 0.0 |
- | HTS_Engine_set_volume |
-z i |
audio buffer size (if i==0, turn off) |
☓ | 0 |
0 〜 |
HTS_Engine_set_audio_buff_size |
infile |
text file (default = stdin) |
○ | - | - | pyopenjtalk. tts(text) |
定義
上記にHTS_EngineのAPIを参照するコードが書かれているっぽい。Cython(CPythonではなくC拡張生成ツールのほう)をもちいて、C言語で書かれたHTS_EngineのAPIをPythonから使えるようにしていると思われる。
おそらく他にも多数のAPIがあるはず。それをここで定義すればよいと思われる。
HTS_engine.h
というヘッダファイルからAPIを参照しているっぽい。そのファイル名でググったら一発でコードが見つかった。
HTS_engine.hから使いたいAPIをコピーして、htsengine.init.pxdにペーストする。これで参照できるようになると思う。
PyOpenJTalk側に実装されていなかったAPIを追記したものが以下コードになる。
htsengine.init.pxd
# distutils: language = c++
# Cython declarations for the hts_engine_API C interface (HTS_engine.h).
# NOTE: C/C++ comments (/* */ and //) are not valid Cython syntax -- use '#'.

from libc.stdio cimport FILE  # the save_* APIs take a C ``FILE *``

cdef extern from "HTS_engine.h":
    cdef cppclass _HTS_Engine:
        pass  # opaque: members are never touched from Cython
    ctypedef _HTS_Engine HTS_Engine
    # HTS_Boolean is declared in HTS_engine.h (presumably `typedef char`) --
    # TODO confirm against the installed header.
    ctypedef char HTS_Boolean

    # --- declarations already present in pyopenjtalk ---
    void HTS_Engine_initialize(HTS_Engine * engine)
    char HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices)
    size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine)
    size_t HTS_Engine_get_fperiod(HTS_Engine * engine)
    void HTS_Engine_refresh(HTS_Engine * engine)
    void HTS_Engine_clear(HTS_Engine * engine)
    const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)
    char HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
    char HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn)
    double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index)
    size_t HTS_Engine_get_nsamples(HTS_Engine * engine)
    void HTS_Engine_set_speed(HTS_Engine * engine, double f)
    void HTS_Engine_add_half_tone(HTS_Engine * engine, double f)

    # --- additions, copied from the upstream header ---
    # https://github.com/r9y9/hts_engine_API/blob/master/src/include/HTS_engine.h
    void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i)   # set sampling frequency
    void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i)              # set frame period
    void HTS_Engine_set_volume(HTS_Engine * engine, double f)               # set volume in dB
    double HTS_Engine_get_volume(HTS_Engine * engine)                       # get volume in dB
    void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f)
    double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index)
    void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f)
    double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index)
    void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i)
    size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine)
    void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b)
    HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine)
    void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b)
    void HTS_Engine_set_alpha(HTS_Engine * engine, double f)
    double HTS_Engine_get_alpha(HTS_Engine * engine)
    void HTS_Engine_set_beta(HTS_Engine * engine, double f)
    double HTS_Engine_get_beta(HTS_Engine * engine)
    void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f)
    double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index)
    void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
    double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
    void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
    double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
    size_t HTS_Engine_get_total_state(HTS_Engine * engine)
    void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f)
    double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index)
    size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index)
    size_t HTS_Engine_get_nvoices(HTS_Engine * engine)
    size_t HTS_Engine_get_nstream(HTS_Engine * engine)
    size_t HTS_Engine_get_nstate(HTS_Engine * engine)
    const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)
    size_t HTS_Engine_get_total_frame(HTS_Engine * engine)
    double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index)
    HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn)
    HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
    HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)
    HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine)
    void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp)
    void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp)
    void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp)
    void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp)
    void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp)
htsengine.pyx
定義したAPIを利用する。
以下のようにhtsengine.init.pxdで定義したAPIを、htsengine.pyxでcimport
する。
from htsengine cimport (
HTS_Engine_initialize, HTS_Engine_load, HTS_Engine_clear, HTS_Engine_refresh,
HTS_Engine_get_sampling_frequency, HTS_Engine_get_fperiod,
HTS_Engine_set_speed, HTS_Engine_add_half_tone,
HTS_Engine_synthesize_from_strings,
HTS_Engine_get_generated_speech, HTS_Engine_get_nsamples
)
追加したコードを追記すると以下。
from htsengine cimport (
HTS_Engine_initialize, HTS_Engine_load, HTS_Engine_clear, HTS_Engine_refresh,
HTS_Engine_get_sampling_frequency, HTS_Engine_get_fperiod,
HTS_Engine_set_speed, HTS_Engine_add_half_tone,
HTS_Engine_synthesize_from_strings,
HTS_Engine_get_generated_speech, HTS_Engine_get_nsamples,
HTS_Engine_set_sampling_frequency,
HTS_Engine_set_fperiod,
HTS_Engine_set_volume,
HTS_Engine_get_volume,
HTS_Engine_set_msd_threshold,
HTS_Engine_get_msd_threshold,
HTS_Engine_set_gv_weight,
HTS_Engine_get_gv_weight,
HTS_Engine_set_audio_buff_size,
HTS_Engine_get_audio_buff_size,
HTS_Engine_set_stop_flag,
HTS_Engine_get_stop_flag,
HTS_Engine_set_phoneme_alignment_flag,
HTS_Engine_set_alpha,
HTS_Engine_get_alpha,
HTS_Engine_set_beta,
HTS_Engine_get_beta,
HTS_Engine_set_duration_interpolation_weight,
HTS_Engine_get_duration_interpolation_weight,
HTS_Engine_set_parameter_interpolation_weight,
HTS_Engine_get_parameter_interpolation_weight,
HTS_Engine_set_gv_interpolation_weight,
HTS_Engine_get_gv_interpolation_weight,
HTS_Engine_get_total_state,
HTS_Engine_set_state_mean,
HTS_Engine_get_state_mean,
HTS_Engine_get_state_duration,
HTS_Engine_get_nvoices,
HTS_Engine_get_nstream,
HTS_Engine_get_nstate,
HTS_Engine_get_fullcontext_label_version,
HTS_Engine_get_total_frame,
HTS_Engine_get_generated_parameter,
HTS_Engine_generate_state_sequence_from_fn,
HTS_Engine_generate_state_sequence_from_strings,
HTS_Engine_generate_parameter_sequence,
HTS_Engine_generate_sample_sequence,
HTS_Engine_save_information,
HTS_Engine_save_label,
HTS_Engine_save_generated_parameter,
HTS_Engine_save_generated_speech,
HTS_Engine_save_riff
)
API名を抽出して末尾にカンマをつければいい。面倒なので以下コマンドで加工した。
cat hts.txt | grep -oP 'HTS_Engine_[_a-zA-Z]+' | sed -e 's/$/,/'
これでhtsengine.pyx内でHTS_Engineの全APIが使えるようになった。
あとはこれをPythonの文脈でラップするだけ。たとえば以下のように。これを追加した全APIにほどこす。
def set_speed(self, speed=1.0): HTS_Engine_set_speed(self.engine, speed)
追加メソッド
htsengine.pyxのcdef class HTSEngine
内に以下を追記する。
# ---- additional API: start ----
# These defs belong inside ``cdef class HTSEngine`` in htsengine.pyx
# (indent one level when pasting).

def set_sampling_frequency(self, size_t i):
    # fixed: ``self`` was missing from the original signature
    HTS_Engine_set_sampling_frequency(self.engine, i)

def set_fperiod(self, size_t i):
    HTS_Engine_set_fperiod(self.engine, i)

def set_volume(self, double f):
    HTS_Engine_set_volume(self.engine, f)

def get_volume(self):
    return HTS_Engine_get_volume(self.engine)  # double

def set_msd_threshold(self, size_t stream_index, double f):
    HTS_Engine_set_msd_threshold(self.engine, stream_index, f)

def get_msd_threshold(self, size_t stream_index):
    return HTS_Engine_get_msd_threshold(self.engine, stream_index)  # double

def set_gv_weight(self, size_t stream_index, double f):
    HTS_Engine_set_gv_weight(self.engine, stream_index, f)

def get_gv_weight(self, size_t stream_index):
    return HTS_Engine_get_gv_weight(self.engine, stream_index)  # double

def set_audio_buff_size(self, size_t i):
    HTS_Engine_set_audio_buff_size(self.engine, i)

def get_audio_buff_size(self):
    return HTS_Engine_get_audio_buff_size(self.engine)  # size_t

def set_stop_flag(self, HTS_Boolean b):
    HTS_Engine_set_stop_flag(self.engine, b)

def get_stop_flag(self):
    # fixed: ``return`` was missing
    return HTS_Engine_get_stop_flag(self.engine)  # HTS_Boolean

def set_phoneme_alignment_flag(self, HTS_Boolean b):
    HTS_Engine_set_phoneme_alignment_flag(self.engine, b)

def set_alpha(self, double f):
    HTS_Engine_set_alpha(self.engine, f)

def get_alpha(self):
    return HTS_Engine_get_alpha(self.engine)  # double

def set_beta(self, double f):
    HTS_Engine_set_beta(self.engine, f)

def get_beta(self):
    return HTS_Engine_get_beta(self.engine)  # double

def set_duration_interpolation_weight(self, size_t voice_index, double f):
    HTS_Engine_set_duration_interpolation_weight(self.engine, voice_index, f)

def get_duration_interpolation_weight(self, size_t voice_index):
    return HTS_Engine_get_duration_interpolation_weight(self.engine, voice_index)  # double

def set_parameter_interpolation_weight(self, size_t voice_index, size_t stream_index, double f):
    HTS_Engine_set_parameter_interpolation_weight(self.engine, voice_index, stream_index, f)

def get_parameter_interpolation_weight(self, size_t voice_index, size_t stream_index):
    return HTS_Engine_get_parameter_interpolation_weight(self.engine, voice_index, stream_index)  # double

def set_gv_interpolation_weight(self, size_t voice_index, size_t stream_index, double f):
    HTS_Engine_set_gv_interpolation_weight(self.engine, voice_index, stream_index, f)

def get_gv_interpolation_weight(self, size_t voice_index, size_t stream_index):
    return HTS_Engine_get_gv_interpolation_weight(self.engine, voice_index, stream_index)  # double

def get_total_state(self):
    return HTS_Engine_get_total_state(self.engine)  # size_t

def set_state_mean(self, size_t stream_index, size_t state_index, size_t vector_index, double f):
    HTS_Engine_set_state_mean(self.engine, stream_index, state_index, vector_index, f)

def get_state_mean(self, size_t stream_index, size_t state_index, size_t vector_index):
    return HTS_Engine_get_state_mean(self.engine, stream_index, state_index, vector_index)  # double

def get_state_duration(self, size_t state_index):
    # fixed: ``return`` was missing
    return HTS_Engine_get_state_duration(self.engine, state_index)  # size_t

def get_nvoices(self):
    return HTS_Engine_get_nvoices(self.engine)  # size_t

def get_nstream(self):
    return HTS_Engine_get_nstream(self.engine)  # size_t

def get_nstate(self):
    return HTS_Engine_get_nstate(self.engine)  # size_t

def get_fullcontext_label_version(self):
    return HTS_Engine_get_fullcontext_label_version(self.engine)  # const char *

def get_total_frame(self):
    return HTS_Engine_get_total_frame(self.engine)  # size_t

def get_generated_parameter(self, size_t stream_index, size_t frame_index, size_t vector_index):
    # fixed: ``return`` was missing
    return HTS_Engine_get_generated_parameter(self.engine, stream_index, frame_index, vector_index)  # double

def generate_state_sequence_from_fn(self, const char *fn):
    return HTS_Engine_generate_state_sequence_from_fn(self.engine, fn)  # HTS_Boolean

def generate_state_sequence_from_strings(self, char **lines, size_t num_lines):
    return HTS_Engine_generate_state_sequence_from_strings(self.engine, lines, num_lines)  # HTS_Boolean

def generate_parameter_sequence(self):
    return HTS_Engine_generate_parameter_sequence(self.engine)  # HTS_Boolean

def generate_sample_sequence(self):
    return HTS_Engine_generate_sample_sequence(self.engine)  # HTS_Boolean

def save_information(self, FILE * fp):
    HTS_Engine_save_information(self.engine, fp)

def save_label(self, FILE * fp):
    HTS_Engine_save_label(self.engine, fp)

def save_generated_parameter(self, size_t stream_index, FILE * fp):
    HTS_Engine_save_generated_parameter(self.engine, stream_index, fp)

def save_generated_speech(self, FILE * fp):
    HTS_Engine_save_generated_speech(self.engine, fp)

def save_riff(self, FILE * fp):
    HTS_Engine_save_riff(self.engine, fp)
# ---- additional API: end ----
ttsのオプション引数に追加する
エンドユーザが使うAPIはttsである。このオプション引数として、上記APIのいくつかを使用するように変更する。
コマンドツール版にあって、PyOpenJTalkにないものを追加する。それは以下である。
options | comment | 既存 | init | range | API |
---|---|---|---|---|---|
-ow s |
filename of output wav audio (generated speech) |
☓ | - | - | HTS_Engine_save_generated_speech |
-ot s |
filename of output trace information |
☓ | - | - | HTS_Engine_save_information HTS_Engine_save_label HTS_Engine_save_generated_parameter HTS_Engine_save_riff |
-s i |
sampling frequency |
☓ | auto |
1 〜 |
HTS_Engine_set_sampling_frequency |
-p i |
frame period (point) |
☓ | auto |
1 〜 |
HTS_Engine_set_fperiod |
-u f |
voiced/unvoiced threshold |
☓ | 0.5 |
0.0 〜1.0 |
HTS_Engine_set_msd_threshold |
-jm f |
weight of GV for spectrum |
☓ | 1.0 |
0.0 〜 |
HTS_Engine_set_gv_weight |
-jf f |
weight of GV for log F0 |
☓ | 1.0 |
0.0 〜 |
HTS_Engine_set_gv_interpolation_weight |
-g f |
volume (dB) |
☓ | 0.0 |
- | HTS_Engine_set_volume |
-z i |
audio buffer size (if i==0, turn off) |
☓ | 0 |
0 〜 |
HTS_Engine_set_audio_buff_size |
なお、以下については対応APIがわからなかったので未対応である。
options | comment | 既存 | init | range | API |
---|---|---|---|---|---|
-a f |
all-pass constant |
? | auto |
0.0 〜1.0 |
対応API不明 |
-b f |
postfiltering coefficient |
? | 0.0 |
0.0 〜1.0 |
対応API不明 |
また、以下については最初から実装済みである。
options | comment | 既存 | init | range | API |
---|---|---|---|---|---|
-x dir |
dictionary directory |
○ | - | - | OpenJTalk(dn_mecab=dn_mecab.encode('ascii')) |
-m htsvoice |
HTS voice files |
○ | - | - | HTSEngine(htsvoice.encode('ascii')) |
-r f |
speech speed rate |
○ | 1.0 |
0.0 〜 |
pyopenjtalk. tts(speed=1.0) |
-fm f |
additional half-tone |
○ | 0.0 |
- | pyopenjtalk. tts(half_tone=0.0) |
infile |
text file (default = stdin) |
○ | - | - | pyopenjtalk. tts(text) |
コードをみてみる。tts, synthesizeは以下のとおり。
def tts(text, speed=1.0, half_tone=0.0): """Text-to-speech Args: text (str): Input text speed (float): speech speed rate. Default is 1.0. half_tone (float): additional half-tone. Default is 0. Returns: np.ndarray: speech waveform (dtype: np.float64) int: sampling frequency (defualt: 48000) """ return synthesize(extract_fullcontext(text), speed, half_tone)
def synthesize(labels, speed=1.0, half_tone=0.0): """Run OpenJTalk's speech synthesis backend Args: labels (list): Full-context labels speed (float): speech speed rate. Default is 1.0. half_tone (float): additional half-tone. Default is 0. Returns: np.ndarray: speech waveform (dtype: np.float64) int: sampling frequency (defualt: 48000) """ if isinstance(labels, tuple) and len(labels) == 2: labels = labels[1] global _global_htsengine if _global_htsengine is None: _global_htsengine = HTSEngine(DEFAULT_HTS_VOICE) sr = _global_htsengine.get_sampling_frequency() _global_htsengine.set_speed(speed) _global_htsengine.add_half_tone(half_tone) return _global_htsengine.synthesize(labels), sr
これにパラメータを追記する。
def tts(text, speed=1.0, half_tone=0.0, sampling_frequency=-1, frame_period=-1, threshold=0.5, weight=1.0, weight_f0=1.0, volume=0.0, buffer_size=0.0, file_name=None, info_file_name=None, label_file_name=None, param_file_name=None, riff_file_name=None ): return synthesize(extract_fullcontext(text), speed, half_tone, sampling_frequency=sampling_frequency, frame_period=frame_period, threshold=threshold, weight=weight, weight_f0=weight_f0, volume=volume, buffer_size=buffer_size, file_name=file_name, info_file_name=info_file_name, label_file_name=label_file_name, param_file_name=param_file_name, riff_file_name=riff_file_name)
def synthesize(labels, speed=1.0, half_tone=0.0,
               sampling_frequency=-1, frame_period=-1,
               threshold=0.5, weight=1.0, weight_f0=1.0,
               volume=0.0, buffer_size=0,
               file_name=None, info_file_name=None, label_file_name=None,
               param_file_name=None, riff_file_name=None):
    """Run OpenJTalk's speech synthesis backend

    Args:
        labels (list): Full-context labels
        speed (float): speech speed rate (-r). Default is 1.0.
        half_tone (float): additional half-tone (-fm). Default is 0.
        sampling_frequency (int): sampling frequency (-s); -1 keeps the
            voice default.
        frame_period (int): frame period in points (-p); -1 keeps the
            voice default.
        threshold (float): voiced/unvoiced threshold (-u), 0.0-1.0.
        weight (float): GV weight for spectrum (-jm), >= 0.0.
        weight_f0 (float): GV weight for log F0 (-jf), >= 0.0.
        volume (float): volume in dB (-g); may be negative.
        buffer_size (int): audio buffer size (-z); 0 turns it off.
        file_name, info_file_name, label_file_name, param_file_name,
        riff_file_name (str or None): output file names (-ow / -ot);
            currently unused (see note below).

    Returns:
        np.ndarray: speech waveform (dtype: np.float64)
        int: sampling frequency (default: 48000)
    """
    if isinstance(labels, tuple) and len(labels) == 2:
        labels = labels[1]
    global _global_htsengine
    if _global_htsengine is None:
        _global_htsengine = HTSEngine(DEFAULT_HTS_VOICE)
    sr = _global_htsengine.get_sampling_frequency()
    _global_htsengine.set_speed(speed)
    _global_htsengine.add_half_tone(half_tone)
    # Negative sentinels mean "keep the value loaded from the htsvoice".
    if 1 <= sampling_frequency:
        _global_htsengine.set_sampling_frequency(int(sampling_frequency))
    if 1 <= frame_period:
        _global_htsengine.set_fperiod(int(frame_period))
    if 0.0 <= threshold <= 1.0:
        _global_htsengine.set_msd_threshold(0, threshold)  # stream_index=0
    if 0.0 <= weight:
        _global_htsengine.set_gv_weight(0, weight)  # stream_index=0
    if 0.0 <= weight_f0:
        # voice_index=0, stream_index=0
        _global_htsengine.set_gv_interpolation_weight(0, 0, weight_f0)
    # Fixed: volume is in dB and may legitimately be negative (open_jtalk -g
    # has no lower bound), so no ``0.0 <= volume`` guard.
    _global_htsengine.set_volume(volume)
    if 0 <= buffer_size:
        # Fixed: cast to int -- the engine API takes a size_t.
        _global_htsengine.set_audio_buff_size(int(buffer_size))
    # NOTE(review): the save_* wrappers take a C ``FILE *``; a Python file
    # object from open() cannot be passed directly, and the files would need
    # write mode ('wb'), not 'rb'.  Disabled until an fopen()-based wrapper
    # exists.  (The original had a stray ``file_name=None, ...`` statement
    # here that was a syntax error -- removed.)
    # if file_name is not None: save_generated_speech(...)       # -ow
    # if info_file_name is not None: save_information(...)       # -ot
    # if label_file_name is not None: save_label(...)            # -ot
    # if param_file_name is not None: save_generated_parameter(0, ...)
    # if riff_file_name is not None: save_riff(...)
    return _global_htsengine.synthesize(labels), sr
# HTSEngine wrapper methods (duplicate listing of the additions above).
# These belong inside ``cdef class HTSEngine``; indent one level when pasting.

def save_information(self, FILE * fp):
    HTS_Engine_save_information(self.engine, fp)

def save_label(self, FILE * fp):
    HTS_Engine_save_label(self.engine, fp)

def save_generated_parameter(self, size_t stream_index, FILE * fp):
    HTS_Engine_save_generated_parameter(self.engine, stream_index, fp)

def save_generated_speech(self, FILE * fp):
    HTS_Engine_save_generated_speech(self.engine, fp)

def save_riff(self, FILE * fp):
    HTS_Engine_save_riff(self.engine, fp)

def set_audio_buff_size(self, size_t i):
    HTS_Engine_set_audio_buff_size(self.engine, i)

def get_audio_buff_size(self):
    return HTS_Engine_get_audio_buff_size(self.engine)  # size_t

def set_stop_flag(self, HTS_Boolean b):
    HTS_Engine_set_stop_flag(self.engine, b)

def get_stop_flag(self):
    # fixed: ``return`` was missing
    return HTS_Engine_get_stop_flag(self.engine)  # HTS_Boolean

def set_phoneme_alignment_flag(self, HTS_Boolean b):
    HTS_Engine_set_phoneme_alignment_flag(self.engine, b)

def set_alpha(self, double f):
    HTS_Engine_set_alpha(self.engine, f)

def get_alpha(self):
    return HTS_Engine_get_alpha(self.engine)  # double

def set_beta(self, double f):
    HTS_Engine_set_beta(self.engine, f)

def get_beta(self):
    return HTS_Engine_get_beta(self.engine)  # double

def set_duration_interpolation_weight(self, size_t voice_index, double f):
    HTS_Engine_set_duration_interpolation_weight(self.engine, voice_index, f)

def get_duration_interpolation_weight(self, size_t voice_index):
    return HTS_Engine_get_duration_interpolation_weight(self.engine, voice_index)  # double

def set_parameter_interpolation_weight(self, size_t voice_index, size_t stream_index, double f):
    HTS_Engine_set_parameter_interpolation_weight(self.engine, voice_index, stream_index, f)

def get_parameter_interpolation_weight(self, size_t voice_index, size_t stream_index):
    return HTS_Engine_get_parameter_interpolation_weight(self.engine, voice_index, stream_index)  # double

def get_total_state(self):
    return HTS_Engine_get_total_state(self.engine)  # size_t

def set_state_mean(self, size_t stream_index, size_t state_index, size_t vector_index, double f):
    HTS_Engine_set_state_mean(self.engine, stream_index, state_index, vector_index, f)

def get_state_mean(self, size_t stream_index, size_t state_index, size_t vector_index):
    return HTS_Engine_get_state_mean(self.engine, stream_index, state_index, vector_index)  # double

def get_state_duration(self, size_t state_index):
    # fixed: ``return`` was missing
    return HTS_Engine_get_state_duration(self.engine, state_index)  # size_t

def get_nvoices(self):
    return HTS_Engine_get_nvoices(self.engine)  # size_t

def get_nstream(self):
    return HTS_Engine_get_nstream(self.engine)  # size_t

def get_nstate(self):
    return HTS_Engine_get_nstate(self.engine)  # size_t

def get_fullcontext_label_version(self):
    return HTS_Engine_get_fullcontext_label_version(self.engine)  # const char *

def get_total_frame(self):
    return HTS_Engine_get_total_frame(self.engine)  # size_t

def get_generated_parameter(self, size_t stream_index, size_t frame_index, size_t vector_index):
    # fixed: ``return`` was missing
    return HTS_Engine_get_generated_parameter(self.engine, stream_index, frame_index, vector_index)  # double

def generate_state_sequence_from_fn(self, const char *fn):
    return HTS_Engine_generate_state_sequence_from_fn(self.engine, fn)  # HTS_Boolean

def generate_state_sequence_from_strings(self, char **lines, size_t num_lines):
    return HTS_Engine_generate_state_sequence_from_strings(self.engine, lines, num_lines)  # HTS_Boolean

def generate_parameter_sequence(self):
    return HTS_Engine_generate_parameter_sequence(self.engine)  # HTS_Boolean

def generate_sample_sequence(self):
    return HTS_Engine_generate_sample_sequence(self.engine)  # HTS_Boolean
-ow s
|filename of output wav audio (generated speech)
|☓|-|-|HTS_Engine_save_generated_speech
-ot s
|filename of output trace information
|☓|-|-|HTS_Engine_save_information
HTS_Engine_save_label
HTS_Engine_save_generated_parameter
HTS_Engine_save_riff
-s i
|sampling frequency
|☓|auto
|1
〜|HTS_Engine_set_sampling_frequency
-p i
|frame period (point)
|☓|auto
|1
〜|HTS_Engine_set_fperiod
-u f
|voiced/unvoiced threshold
|☓|0.5
|0.0
〜1.0
|HTS_Engine_set_msd_threshold
-jm f
|weight of GV for spectrum
|☓|1.0
|0.0
〜|HTS_Engine_set_gv_weight
-jf f
|weight of GV for log F0
|☓|1.0
|0.0
〜|HTS_Engine_set_gv_interpolation_weight
-g f
|volume (dB)
|☓|0.0
|-|HTS_Engine_set_volume
-z i
|audio buffer size (if i==0, turn off)
|☓|0
|0
〜|HTS_Engine_set_audio_buff_size
どうやってビルドすればいいの?
コードを修正してみたはいいが、ビルドする方法がわからなかった。
ローカルでビルドして、ローカルの仮想環境にインストールしてみたかった。でも、どうやればいいかさっぱりわからん。setup.pyをみてみたし、動かしてみたけど、C++のコードがないって怒られた。
Traceback (most recent call last): File "/tmp/work/test/pyopenjtalk/setup.py", line 71, in <module> raise RuntimeError("Cython is required to generate C++ code") RuntimeError: Cython is required to generate C++ code
エラーメッセージをよく読むと、足りないのはHTS_EngineのC言語コードではなく、C++コードを生成するためのCython本体らしい(pip install cython で入るはず)。
具体的にどうしたらいいんだろう。もう疲れたからここまで。
所感
そもそもpxd, pyxの拡張子や、その書き方、ビルド方法などを知らない。まわりのコードにあわせてそれっぽく書いてるだけ。そんなんでビルドできるのだろうか。
対象環境
- Raspberry Pi 4 Model B
- Raspberry Pi OS buster 10.0 2020-08-20 ※
- bash 5.0.3(1)-release
$ uname -a Linux raspberrypi 5.4.83-v7l+ #1379 SMP Mon Dec 14 13:11:54 GMT 2020 armv7l GNU/Linux