> vq_model
SynthesizerTrn(
  (enc_p): TextEncoder(
    (ssl_proj): Conv1d(768, 192, kernel_size=(1,), stride=(1,))
    (encoder_ssl): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-2): 3 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-2): 3 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-2): 3 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-2): 3 x LayerNorm()
      )
    )
    (encoder_text): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-5): 6 x LayerNorm()
      )
    )
    (text_embedding): Embedding(322, 192)
    (mrte): MRTE(
      (cross_attention): MultiHeadAttention(
        (conv_q): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
        (conv_k): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
        (conv_v): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
        (conv_o): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
        (drop): Dropout(p=0.0, inplace=False)
      )
      (c_pre): Conv1d(192, 512, kernel_size=(1,), stride=(1,))
      (text_pre): Conv1d(192, 512, kernel_size=(1,), stride=(1,))
      (c_post): Conv1d(512, 192, kernel_size=(1,), stride=(1,))
    )
    (encoder2): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-2): 3 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-2): 3 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-2): 3 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-2): 3 x LayerNorm()
      )
    )
    (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
  )
  (dec): Generator(
    (conv_pre): Conv1d(192, 512, kernel_size=(7,), stride=(1,), padding=(3,))
    (ups): ModuleList(
      (0): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(10,), padding=(3,))
      (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
      (2): ConvTranspose1d(128, 64, kernel_size=(8,), stride=(2,), padding=(3,))
      (3): ConvTranspose1d(64, 32, kernel_size=(2,), stride=(2,))
      (4): ConvTranspose1d(32, 16, kernel_size=(2,), stride=(2,))
    )
    (resblocks): ModuleList(
      (0): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
          (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
      (1): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
          (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
          (2): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
        )
      )
      (2): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
          (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
          (2): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
        )
      )
      (3): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
          (2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
      (4): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
          (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
          (2): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
        )
      )
      (5): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
          (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
          (2): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
        )
      )
      (6): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
          (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
      (7): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
          (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
          (2): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
        )
      )
      (8): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
          (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
          (2): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
        )
      )
      (9): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
          (2): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
      (10): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
          (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
          (2): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
        )
      )
      (11): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
          (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
          (2): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
        )
      )
      (12): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
          (2): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
      (13): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
          (1): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
          (2): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
        )
      )
      (14): ResBlock1(
        (convs1): ModuleList(
          (0): Conv1d(16, 16, kernel_size=(11,), stride=(1,), padding=(5,))
          (1): Conv1d(16, 16, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
          (2): Conv1d(16, 16, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
        )
        (convs2): ModuleList(
          (0-2): 3 x Conv1d(16, 16, kernel_size=(11,), stride=(1,), padding=(5,))
        )
      )
    )
    (conv_post): Conv1d(16, 1, kernel_size=(7,), stride=(1,), padding=(3,), bias=False)
    (cond): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
  )
  (enc_q): PosteriorEncoder(
    (pre): Conv1d(1025, 192, kernel_size=(1,), stride=(1,))
    (enc): WN(
      (in_layers): ModuleList(
        (0-15): 16 x Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
      )
      (res_skip_layers): ModuleList(
        (0-14): 15 x Conv1d(192, 384, kernel_size=(1,), stride=(1,))
        (15): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
      )
      (drop): Dropout(p=0, inplace=False)
      (cond_layer): Conv1d(512, 6144, kernel_size=(1,), stride=(1,))
    )
    (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
  )
  (flow): ResidualCouplingBlock(
    (flows): ModuleList(
      (0): ResidualCouplingLayer(
        (pre): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
        (enc): WN(
          (in_layers): ModuleList(
            (0-3): 4 x Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
          )
          (res_skip_layers): ModuleList(
            (0-2): 3 x Conv1d(192, 384, kernel_size=(1,), stride=(1,))
            (3): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          )
          (drop): Dropout(p=0, inplace=False)
          (cond_layer): Conv1d(512, 1536, kernel_size=(1,), stride=(1,))
        )
        (post): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
      )
      (1): Flip()
      (2): ResidualCouplingLayer(
        (pre): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
        (enc): WN(
          (in_layers): ModuleList(
            (0-3): 4 x Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
          )
          (res_skip_layers): ModuleList(
            (0-2): 3 x Conv1d(192, 384, kernel_size=(1,), stride=(1,))
            (3): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          )
          (drop): Dropout(p=0, inplace=False)
          (cond_layer): Conv1d(512, 1536, kernel_size=(1,), stride=(1,))
        )
        (post): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
      )
      (3): Flip()
      (4): ResidualCouplingLayer(
        (pre): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
        (enc): WN(
          (in_layers): ModuleList(
            (0-3): 4 x Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
          )
          (res_skip_layers): ModuleList(
            (0-2): 3 x Conv1d(192, 384, kernel_size=(1,), stride=(1,))
            (3): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          )
          (drop): Dropout(p=0, inplace=False)
          (cond_layer): Conv1d(512, 1536, kernel_size=(1,), stride=(1,))
        )
        (post): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
      )
      (5): Flip()
      (6): ResidualCouplingLayer(
        (pre): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
        (enc): WN(
          (in_layers): ModuleList(
            (0-3): 4 x Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
          )
          (res_skip_layers): ModuleList(
            (0-2): 3 x Conv1d(192, 384, kernel_size=(1,), stride=(1,))
            (3): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          )
          (drop): Dropout(p=0, inplace=False)
          (cond_layer): Conv1d(512, 1536, kernel_size=(1,), stride=(1,))
        )
        (post): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
      )
      (7): Flip()
    )
  )
  (ref_enc): MelStyleEncoder(
    (spectral): Sequential(
      (0): LinearNorm(
        (fc): Linear(in_features=1025, out_features=128, bias=True)
      )
      (1): Mish()
      (2): Dropout(p=0.1, inplace=False)
      (3): LinearNorm(
        (fc): Linear(in_features=128, out_features=128, bias=True)
      )
      (4): Mish()
      (5): Dropout(p=0.1, inplace=False)
    )
    (temporal): Sequential(
      (0): Conv1dGLU(
        (conv1): ConvNorm(
          (conv): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): Conv1dGLU(
        (conv1): ConvNorm(
          (conv): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (slf_attn): MultiHeadAttention(
      (w_qs): Linear(in_features=128, out_features=128, bias=True)
      (w_ks): Linear(in_features=128, out_features=128, bias=True)
      (w_vs): Linear(in_features=128, out_features=128, bias=True)
      (attention): ScaledDotProductAttention(
        (softmax): Softmax(dim=2)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (fc): Linear(in_features=128, out_features=128, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (fc): LinearNorm(
      (fc): Linear(in_features=128, out_features=512, bias=True)
    )
  )
  (ssl_proj): Conv1d(768, 768, kernel_size=(2,), stride=(2,))
  (quantizer): ResidualVectorQuantizer(
    (vq): ResidualVectorQuantization(
      (layers): ModuleList(
        (0): VectorQuantization(
          (project_in): Identity()
          (project_out): Identity()
          (_codebook): EuclideanCodebook()
        )
      )
    )
  )
)
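
The printout above can be reproduced and probed with standard PyTorch introspection. Below is a minimal sketch (the helper names are illustrative, not part of the repo) that counts parameters per top-level submodule and multiplies the `ConvTranspose1d` strides in `dec.ups` to get the overall frame-to-waveform upsampling factor, which for the strides shown is 10 * 8 * 2 * 2 * 2 = 640. It assumes `vq_model` is the `SynthesizerTrn` instance printed above.

```python
from torch import nn

def summarize(model: nn.Module) -> None:
    """Print per-submodule and total parameter counts for any nn.Module."""
    total = 0
    for name, child in model.named_children():
        n = sum(p.numel() for p in child.parameters())
        total += n
        print(f"{name:>12s}: {n / 1e6:7.2f} M params")
    print(f"{'total':>12s}: {total / 1e6:7.2f} M params")

def upsampling_factor(generator: nn.Module) -> int:
    """Multiply the ConvTranspose1d strides in the Generator's `ups` list."""
    factor = 1
    for layer in generator.ups:
        factor *= layer.stride[0]
    return factor

# Usage, assuming the checkpoint has been loaded as `vq_model`:
# summarize(vq_model)                      # enc_p, dec, enc_q, flow, ref_enc, ...
# print(upsampling_factor(vq_model.dec))   # 10 * 8 * 2 * 2 * 2 = 640
```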
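The `mrte` block inside `enc_p` fuses the text branch into the SSL-content branch through a single cross-attention: both streams are projected from 192 to 512 channels (`c_pre`, `text_pre`), attended, and mapped back to 192 (`c_post`). The sketch below is only a schematic reading of that data flow, using `nn.MultiheadAttention` instead of the repo's convolutional `MultiHeadAttention`; the channel sizes come from the printout, while the head count and the assignment of query (content) versus key/value (text) are assumptions.

```python
import torch
from torch import nn

class MRTESketch(nn.Module):
    """Schematic stand-in for the mrte block: content frames attend over text frames.

    Channel sizes (192 -> 512 -> 192) follow the printout; the use of
    nn.MultiheadAttention and the head count are assumptions.
    """

    def __init__(self, channels: int = 192, hidden: int = 512, heads: int = 4):
        super().__init__()
        self.c_pre = nn.Conv1d(channels, hidden, 1)
        self.text_pre = nn.Conv1d(channels, hidden, 1)
        self.attn = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.c_post = nn.Conv1d(hidden, channels, 1)

    def forward(self, content: torch.Tensor, text: torch.Tensor) -> torch.Tensor:
        # content, text: (batch, 192, frames), matching the Conv1d-based encoders above.
        q = self.c_pre(content).transpose(1, 2)     # (B, T_content, 512)
        kv = self.text_pre(text).transpose(1, 2)    # (B, T_text, 512)
        mixed, _ = self.attn(q, kv, kv)             # content queries attend over text
        mixed = mixed + q                           # residual on the content stream
        return self.c_post(mixed.transpose(1, 2))   # back to (B, 192, T_content)

content = torch.randn(1, 192, 120)
text = torch.randn(1, 192, 40)
print(MRTESketch()(content, text).shape)            # torch.Size([1, 192, 120])
```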
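The `quantizer` is a residual vector quantizer with a single `VectorQuantization` stage whose `project_in`/`project_out` are `Identity`, so it quantizes the 768-dim features coming out of the stride-2 `ssl_proj` (which halves the SSL frame rate) directly against a `EuclideanCodebook`. The codebook size is not visible in the printout, so the sketch below assumes 1024 entries; it only illustrates the Euclidean nearest-neighbour lookup, not the repo's EMA codebook updates or commitment loss.

```python
import torch

def euclidean_quantize(x: torch.Tensor, codebook: torch.Tensor):
    """Nearest-neighbour lookup against a codebook.

    x:        (batch, frames, dim) continuous features
    codebook: (num_codes, dim) code vectors
    Returns the quantized features and the chosen code indices.
    """
    # Squared Euclidean distance between every frame and every code vector.
    dist = (
        x.pow(2).sum(-1, keepdim=True)   # (B, T, 1)
        - 2 * x @ codebook.t()           # (B, T, K)
        + codebook.pow(2).sum(-1)        # (K,)
    )
    indices = dist.argmin(-1)            # (B, T)
    quantized = codebook[indices]        # (B, T, dim)
    return quantized, indices

# Toy example with assumed sizes: 1024 codes of dimension 768.
codes = torch.randn(1024, 768)
feats = torch.randn(2, 50, 768)          # e.g. SSL features after ssl_proj
q, idx = euclidean_quantize(feats, codes)
print(q.shape, idx.shape)                # torch.Size([2, 50, 768]) torch.Size([2, 50])
```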