diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index ac6b77af3ec4..1462c5fb73e5 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -50,14 +50,6 @@ static ncnn::VkAllocator* g_staging_vkallocator = 0; void benchmark(const char* comment, const std::vector& _in, const ncnn::Option& opt, const char* model_param_data = NULL) { - // Skip if int8 model name and using GPU - if (opt.use_vulkan_compute && strstr(comment, "int8") != NULL) - { - if (!model_param_data) - fprintf(stderr, "%20s skipped (int8+GPU not supported)\n", comment); - return; - } - g_blob_pool_allocator.clear(); g_workspace_pool_allocator.clear(); diff --git a/benchmark/models/efficientnet_b0.param b/benchmark/models/efficientnet_b0.param index 0a762d21cbee..c5014a11fe7d 100644 --- a/benchmark/models/efficientnet_b0.param +++ b/benchmark/models/efficientnet_b0.param @@ -9,7 +9,7 @@ Split splitncnn_0 1 2 368 368_splitncnn_0 368_sp Pooling GlobalAveragePool_8 1 1 368_splitncnn_1 369 -23330=4,1,32,1,1 0=1 4=1 InnerProduct Conv_9 1 1 369 370 -23330=4,1,8,1,1 0=8 1=1 2=256 Swish Mul_11 1 1 370 372 -23330=4,1,8,1,1 -Convolution Conv_12 1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 5=1 6=256 9=4 +InnerProduct Conv_12 1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 2=256 9=4 BinaryOp Mul_14 2 1 368_splitncnn_0 374 375 -23330=4,3,112,112,32 0=2 Convolution Conv_15 1 1 375 377 -23330=4,3,112,112,16 0=16 1=1 5=1 6=512 Convolution Conv_17 1 1 377 379 -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536 @@ -20,7 +20,7 @@ Split splitncnn_1 1 2 385 385_splitncnn_0 385_sp Pooling GlobalAveragePool_25 1 1 385_splitncnn_1 386 -23330=4,1,96,1,1 0=1 4=1 InnerProduct Conv_26 1 1 386 387 -23330=4,1,4,1,1 0=4 1=1 2=384 Swish Mul_28 1 1 387 389 -23330=4,1,4,1,1 -Convolution Conv_29 1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 5=1 6=384 9=4 +InnerProduct Conv_29 1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 2=384 9=4 BinaryOp Mul_31 2 1 385_splitncnn_0 391 392 -23330=4,3,56,56,96 0=2 Convolution Conv_32 1 1 392 394 -23330=4,3,56,56,24 
0=24 1=1 5=1 6=2304 Split splitncnn_2 1 2 394 394_splitncnn_0 394_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 @@ -32,7 +32,7 @@ Split splitncnn_3 1 2 402 402_splitncnn_0 402_sp Pooling GlobalAveragePool_42 1 1 402_splitncnn_1 403 -23330=4,1,144,1,1 0=1 4=1 InnerProduct Conv_43 1 1 403 404 -23330=4,1,6,1,1 0=6 1=1 2=864 Swish Mul_45 1 1 404 406 -23330=4,1,6,1,1 -Convolution Conv_46 1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4 +InnerProduct Conv_46 1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 2=864 9=4 BinaryOp Mul_48 2 1 402_splitncnn_0 408 409 -23330=4,3,56,56,144 0=2 Convolution Conv_49 1 1 409 411 -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456 BinaryOp Add_51 2 1 394_splitncnn_0 411 412 -23330=4,3,56,56,24 @@ -44,7 +44,7 @@ Split splitncnn_4 1 2 420 420_splitncnn_0 420_sp Pooling GlobalAveragePool_60 1 1 420_splitncnn_1 421 -23330=4,1,144,1,1 0=1 4=1 InnerProduct Conv_61 1 1 421 422 -23330=4,1,6,1,1 0=6 1=1 2=864 Swish Mul_63 1 1 422 424 -23330=4,1,6,1,1 -Convolution Conv_64 1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4 +InnerProduct Conv_64 1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 2=864 9=4 BinaryOp Mul_66 2 1 420_splitncnn_0 426 427 -23330=4,3,28,28,144 0=2 Convolution Conv_67 1 1 427 429 -23330=4,3,28,28,40 0=40 1=1 5=1 6=5760 Split splitncnn_5 1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 @@ -56,7 +56,7 @@ Split splitncnn_6 1 2 437 437_splitncnn_0 437_sp Pooling GlobalAveragePool_77 1 1 437_splitncnn_1 438 -23330=4,1,240,1,1 0=1 4=1 InnerProduct Conv_78 1 1 438 439 -23330=4,1,10,1,1 0=10 1=1 2=2400 Swish Mul_80 1 1 439 441 -23330=4,1,10,1,1 -Convolution Conv_81 1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4 +InnerProduct Conv_81 1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 2=2400 9=4 BinaryOp Mul_83 2 1 437_splitncnn_0 443 444 -23330=4,3,28,28,240 0=2 Convolution Conv_84 1 1 444 446 -23330=4,3,28,28,40 0=40 1=1 5=1 6=9600 BinaryOp Add_86 2 1 429_splitncnn_0 446 447 -23330=4,3,28,28,40 @@ -68,7 +68,7 @@ Split splitncnn_7 1 
2 455 455_splitncnn_0 455_sp Pooling GlobalAveragePool_95 1 1 455_splitncnn_1 456 -23330=4,1,240,1,1 0=1 4=1 InnerProduct Conv_96 1 1 456 457 -23330=4,1,10,1,1 0=10 1=1 2=2400 Swish Mul_98 1 1 457 459 -23330=4,1,10,1,1 -Convolution Conv_99 1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4 +InnerProduct Conv_99 1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 2=2400 9=4 BinaryOp Mul_101 2 1 455_splitncnn_0 461 462 -23330=4,3,14,14,240 0=2 Convolution Conv_102 1 1 462 464 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 Split splitncnn_8 1 2 464 464_splitncnn_0 464_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 @@ -80,7 +80,7 @@ Split splitncnn_9 1 2 472 472_splitncnn_0 472_sp Pooling GlobalAveragePool_112 1 1 472_splitncnn_1 473 -23330=4,1,480,1,1 0=1 4=1 InnerProduct Conv_113 1 1 473 474 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_115 1 1 474 476 -23330=4,1,20,1,1 -Convolution Conv_116 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 +InnerProduct Conv_116 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4 BinaryOp Mul_118 2 1 472_splitncnn_0 478 479 -23330=4,3,14,14,480 0=2 Convolution Conv_119 1 1 479 481 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp Add_121 2 1 464_splitncnn_0 481 482 -23330=4,3,14,14,80 @@ -93,7 +93,7 @@ Split splitncnn_11 1 2 490 490_splitncnn_0 490_sp Pooling GlobalAveragePool_130 1 1 490_splitncnn_1 491 -23330=4,1,480,1,1 0=1 4=1 InnerProduct Conv_131 1 1 491 492 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_133 1 1 492 494 -23330=4,1,20,1,1 -Convolution Conv_134 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 +InnerProduct Conv_134 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4 BinaryOp Mul_136 2 1 490_splitncnn_0 496 497 -23330=4,3,14,14,480 0=2 Convolution Conv_137 1 1 497 499 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400 BinaryOp Add_139 2 1 482_splitncnn_0 499 500 -23330=4,3,14,14,80 @@ -105,7 +105,7 @@ Split splitncnn_12 1 2 508 508_splitncnn_0 508_sp Pooling GlobalAveragePool_148 1 1 508_splitncnn_1 509 -23330=4,1,480,1,1 0=1 4=1 
InnerProduct Conv_149 1 1 509 510 -23330=4,1,20,1,1 0=20 1=1 2=9600 Swish Mul_151 1 1 510 512 -23330=4,1,20,1,1 -Convolution Conv_152 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4 +InnerProduct Conv_152 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4 BinaryOp Mul_154 2 1 508_splitncnn_0 514 515 -23330=4,3,14,14,480 0=2 Convolution Conv_155 1 1 515 517 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760 Split splitncnn_13 1 2 517 517_splitncnn_0 517_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 @@ -117,7 +117,7 @@ Split splitncnn_14 1 2 525 525_splitncnn_0 525_sp Pooling GlobalAveragePool_165 1 1 525_splitncnn_1 526 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_166 1 1 526 527 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish Mul_168 1 1 527 529 -23330=4,1,28,1,1 -Convolution Conv_169 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 +InnerProduct Conv_169 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4 BinaryOp Mul_171 2 1 525_splitncnn_0 531 532 -23330=4,3,14,14,672 0=2 Convolution Conv_172 1 1 532 534 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 BinaryOp Add_174 2 1 517_splitncnn_0 534 535 -23330=4,3,14,14,112 @@ -130,7 +130,7 @@ Split splitncnn_16 1 2 543 543_splitncnn_0 543_sp Pooling GlobalAveragePool_183 1 1 543_splitncnn_1 544 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_184 1 1 544 545 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish Mul_186 1 1 545 547 -23330=4,1,28,1,1 -Convolution Conv_187 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 +InnerProduct Conv_187 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4 BinaryOp Mul_189 2 1 543_splitncnn_0 549 550 -23330=4,3,14,14,672 0=2 Convolution Conv_190 1 1 550 552 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 BinaryOp Add_192 2 1 535_splitncnn_0 552 553 -23330=4,3,14,14,112 @@ -142,7 +142,7 @@ Split splitncnn_17 1 2 561 561_splitncnn_0 561_sp Pooling GlobalAveragePool_201 1 1 561_splitncnn_1 562 -23330=4,1,672,1,1 0=1 4=1 InnerProduct Conv_202 1 1 562 563 -23330=4,1,28,1,1 0=28 1=1 2=18816 Swish Mul_204 
1 1 563 565 -23330=4,1,28,1,1 -Convolution Conv_205 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4 +InnerProduct Conv_205 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4 BinaryOp Mul_207 2 1 561_splitncnn_0 567 568 -23330=4,3,7,7,672 0=2 Convolution Conv_208 1 1 568 570 -23330=4,3,7,7,192 0=192 1=1 5=1 6=129024 Split splitncnn_18 1 2 570 570_splitncnn_0 570_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192 @@ -154,7 +154,7 @@ Split splitncnn_19 1 2 578 578_splitncnn_0 578_sp Pooling GlobalAveragePool_218 1 1 578_splitncnn_1 579 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_219 1 1 579 580 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_221 1 1 580 582 -23330=4,1,48,1,1 -Convolution Conv_222 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 +InnerProduct Conv_222 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4 BinaryOp Mul_224 2 1 578_splitncnn_0 584 585 -23330=4,3,7,7,1152 0=2 Convolution Conv_225 1 1 585 587 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_227 2 1 570_splitncnn_0 587 588 -23330=4,3,7,7,192 @@ -167,7 +167,7 @@ Split splitncnn_21 1 2 596 596_splitncnn_0 596_sp Pooling GlobalAveragePool_236 1 1 596_splitncnn_1 597 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_237 1 1 597 598 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_239 1 1 598 600 -23330=4,1,48,1,1 -Convolution Conv_240 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 +InnerProduct Conv_240 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4 BinaryOp Mul_242 2 1 596_splitncnn_0 602 603 -23330=4,3,7,7,1152 0=2 Convolution Conv_243 1 1 603 605 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_245 2 1 588_splitncnn_0 605 606 -23330=4,3,7,7,192 @@ -180,7 +180,7 @@ Split splitncnn_23 1 2 614 614_splitncnn_0 614_sp Pooling GlobalAveragePool_254 1 1 614_splitncnn_1 615 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_255 1 1 615 616 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_257 1 1 616 618 -23330=4,1,48,1,1 -Convolution Conv_258 1 1 618 620 -23330=4,1,1152,1,1 
0=1152 1=1 5=1 6=55296 9=4 +InnerProduct Conv_258 1 1 618 620 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4 BinaryOp Mul_260 2 1 614_splitncnn_0 620 621 -23330=4,3,7,7,1152 0=2 Convolution Conv_261 1 1 621 623 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184 BinaryOp Add_263 2 1 606_splitncnn_0 623 624 -23330=4,3,7,7,192 @@ -192,7 +192,7 @@ Split splitncnn_24 1 2 632 632_splitncnn_0 632_sp Pooling GlobalAveragePool_272 1 1 632_splitncnn_1 633 -23330=4,1,1152,1,1 0=1 4=1 InnerProduct Conv_273 1 1 633 634 -23330=4,1,48,1,1 0=48 1=1 2=55296 Swish Mul_275 1 1 634 636 -23330=4,1,48,1,1 -Convolution Conv_276 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4 +InnerProduct Conv_276 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4 BinaryOp Mul_278 2 1 632_splitncnn_0 638 639 -23330=4,3,7,7,1152 0=2 Convolution Conv_279 1 1 639 641 -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640 Convolution Conv_281 1 1 641 643 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 diff --git a/benchmark/models/googlenet_int8.param b/benchmark/models/googlenet_int8.param index baf13855d353..5ed1575ac6d6 100644 --- a/benchmark/models/googlenet_int8.param +++ b/benchmark/models/googlenet_int8.param @@ -1,96 +1,96 @@ 7767517 94 121 -Input data 0 1 data 0=224 1=224 2=3 -Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2_conv1/relu_7x7 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 -Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 1=3 2=2 -LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 2=0.000100 -Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce 0=64 1=1 5=1 6=4096 8=102 9=1 -Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 0=192 1=3 4=1 5=1 6=110592 8=2 9=1 -LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 2=0.000100 -Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 1=3 2=2 -Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 -Convolution 
inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 0=64 1=1 5=1 6=12288 8=2 9=1 -Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce 0=96 1=1 5=1 6=18432 8=102 9=1 -Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 0=128 1=3 4=1 5=1 6=110592 8=2 9=1 -Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce 0=16 1=1 5=1 6=3072 8=102 9=1 -Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 0=32 1=5 4=2 5=1 6=12800 8=2 9=1 -Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 1=3 3=1 -Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj 0=32 1=1 5=1 6=6144 8=2 9=1 -Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output -Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 -Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 0=128 1=1 5=1 6=32768 8=2 9=1 -Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce 0=128 1=1 5=1 6=32768 8=102 9=1 -Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 0=192 1=3 4=1 5=1 6=221184 8=2 9=1 -Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce 0=32 1=1 5=1 6=8192 8=102 9=1 -Convolution inception_3b/5x5 1 1 
inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 0=96 1=5 4=2 5=1 6=76800 8=2 9=1 -Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 1=3 3=1 -Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj 0=64 1=1 5=1 6=16384 8=2 9=1 -Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output -Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 1=3 2=2 -Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 -Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 0=192 1=1 5=1 6=92160 8=2 9=1 -Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce 0=96 1=1 5=1 6=46080 8=102 9=1 -Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 0=208 1=3 4=1 5=1 6=179712 8=2 9=1 -Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce 0=16 1=1 5=1 6=7680 8=102 9=1 -Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 0=48 1=5 4=2 5=1 6=19200 8=2 9=1 -Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 1=3 3=1 -Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj 0=64 1=1 5=1 6=30720 8=2 9=1 -Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output -Split splitncnn_3 1 4 
inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 -Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 0=160 1=1 5=1 6=81920 8=2 9=1 -Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce 0=112 1=1 5=1 6=57344 8=102 9=1 -Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 0=224 1=3 4=1 5=1 6=225792 8=2 9=1 -Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1 -Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 -Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 1=3 3=1 -Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 -Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output -Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 -Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 0=128 1=1 5=1 6=65536 8=2 9=1 -Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce 0=128 1=1 5=1 6=65536 8=102 9=1 -Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 0=256 1=3 4=1 5=1 6=294912 8=2 9=1 
-Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1 -Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 -Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 1=3 3=1 -Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 -Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output -Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 -Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 0=112 1=1 5=1 6=57344 8=2 9=1 -Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce 0=144 1=1 5=1 6=73728 8=102 9=1 -Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 0=288 1=3 4=1 5=1 6=373248 8=2 9=1 -Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce 0=32 1=1 5=1 6=16384 8=102 9=1 -Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 0=64 1=5 4=2 5=1 6=51200 8=2 9=1 -Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 1=3 3=1 -Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1 -Concat inception_4d/output 4 1 
inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output -Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 -Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 0=256 1=1 5=1 6=135168 8=2 9=1 -Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce 0=160 1=1 5=1 6=84480 8=102 9=1 -Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 -Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce 0=32 1=1 5=1 6=16896 8=102 9=1 -Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 -Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 1=3 3=1 -Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj 0=128 1=1 5=1 6=67584 8=2 9=1 -Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output -Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 1=3 2=2 -Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 -Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 0=256 1=1 5=1 6=212992 8=2 9=1 -Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 
inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce 0=160 1=1 5=1 6=133120 8=102 9=1 -Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 -Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce 0=32 1=1 5=1 6=26624 8=102 9=1 -Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 -Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 1=3 3=1 -Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1 -Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output -Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 -Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 0=384 1=1 5=1 6=319488 8=2 9=1 -Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce 0=192 1=1 5=1 6=159744 8=102 9=1 -Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 0=384 1=3 4=1 5=1 6=663552 8=2 9=1 -Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce 0=48 1=1 5=1 6=39936 8=102 9=1 -Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 0=128 1=5 4=2 5=1 6=153600 8=2 9=1 -Pooling inception_5b/pool 1 1 
inception_5a/output_splitncnn_0 inception_5b/pool 1=3 3=1 -Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1 -Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output -Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 0=1 1=7 -InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000 -Softmax prob 1 1 loss3/classifier output +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2_conv1/relu_7x7 -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 +Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 -23330=4,3,56,56,64 1=3 2=2 +LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 -23330=4,3,56,56,64 2=0.000100 +Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=102 9=1 +Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 -23330=4,3,56,56,192 0=192 1=3 4=1 5=1 6=110592 8=2 9=1 +LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 -23330=4,3,56,56,192 2=0.000100 +Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 -23330=4,3,28,28,192 1=3 2=2 +Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 -23330=16,3,28,28,192,3,28,28,192,3,28,28,192,3,28,28,192 +Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 -23330=4,3,28,28,64 0=64 1=1 5=1 6=12288 8=2 9=1 +Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce -23330=4,3,28,28,96 0=96 1=1 5=1 6=18432 8=102 9=1 +Convolution inception_3a/3x3 1 1 
inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=110592 8=2 9=1 +Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce -23330=4,3,28,28,16 0=16 1=1 5=1 6=3072 8=102 9=1 +Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 -23330=4,3,28,28,32 0=32 1=5 4=2 5=1 6=12800 8=2 9=1 +Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool -23330=4,3,28,28,192 1=3 3=1 +Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 8=2 9=1 +Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output -23330=4,3,28,28,256 +Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 -23330=16,3,28,28,256,3,28,28,256,3,28,28,256,3,28,28,256 +Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 8=2 9=1 +Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 8=102 9=1 +Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=221184 8=2 9=1 +Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 8=102 9=1 +Convolution inception_3b/5x5 1 1 
inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 -23330=4,3,28,28,96 0=96 1=5 4=2 5=1 6=76800 8=2 9=1 +Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool -23330=4,3,28,28,256 1=3 3=1 +Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj -23330=4,3,28,28,64 0=64 1=1 5=1 6=16384 8=2 9=1 +Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output -23330=4,3,28,28,480 +Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 -23330=4,3,14,14,480 1=3 2=2 +Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 -23330=16,3,14,14,480,3,14,14,480,3,14,14,480,3,14,14,480 +Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=92160 8=2 9=1 +Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 8=102 9=1 +Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=179712 8=2 9=1 +Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce -23330=4,3,14,14,16 0=16 1=1 5=1 6=7680 8=102 9=1 +Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 -23330=4,3,14,14,48 0=48 1=5 4=2 5=1 6=19200 8=2 9=1 +Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool -23330=4,3,14,14,480 1=3 3=1 +Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj 
-23330=4,3,14,14,64 0=64 1=1 5=1 6=30720 8=2 9=1 +Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output -23330=4,3,14,14,512 +Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 +Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 -23330=4,3,14,14,160 0=160 1=1 5=1 6=81920 8=2 9=1 +Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 8=102 9=1 +Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 -23330=4,3,14,14,224 0=224 1=3 4=1 5=1 6=225792 8=2 9=1 +Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 8=102 9=1 +Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 +Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool -23330=4,3,14,14,512 1=3 3=1 +Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1 +Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output -23330=4,3,14,14,512 +Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 
inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 +Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 8=2 9=1 +Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 8=102 9=1 +Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=294912 8=2 9=1 +Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 8=102 9=1 +Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 8=2 9=1 +Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool -23330=4,3,14,14,512 1=3 3=1 +Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1 +Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output -23330=4,3,14,14,512 +Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512 +Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 8=2 9=1 +Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 
inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce -23330=4,3,14,14,144 0=144 1=1 5=1 6=73728 8=102 9=1 +Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 -23330=4,3,14,14,288 0=288 1=3 4=1 5=1 6=373248 8=2 9=1 +Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16384 8=102 9=1 +Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=51200 8=2 9=1 +Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool -23330=4,3,14,14,512 1=3 3=1 +Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1 +Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output -23330=4,3,14,14,528 +Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 -23330=16,3,14,14,528,3,14,14,528,3,14,14,528,3,14,14,528 +Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=135168 8=2 9=1 +Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce -23330=4,3,14,14,160 0=160 1=1 5=1 6=84480 8=102 9=1 +Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 -23330=4,3,14,14,320 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 +Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 
inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16896 8=102 9=1 +Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 -23330=4,3,14,14,128 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 +Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool -23330=4,3,14,14,528 1=3 3=1 +Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj -23330=4,3,14,14,128 0=128 1=1 5=1 6=67584 8=2 9=1 +Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output -23330=4,3,14,14,832 +Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 -23330=4,3,7,7,832 1=3 2=2 +Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832 +Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 -23330=4,3,7,7,256 0=256 1=1 5=1 6=212992 8=2 9=1 +Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce -23330=4,3,7,7,160 0=160 1=1 5=1 6=133120 8=102 9=1 +Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 -23330=4,3,7,7,320 0=320 1=3 4=1 5=1 6=460800 8=2 9=1 +Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce -23330=4,3,7,7,32 0=32 1=1 5=1 6=26624 8=102 9=1 +Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=102400 8=2 9=1 +Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 
-23330=4,3,7,7,832 1=3 3=1 +Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 8=2 9=1 +Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output -23330=4,3,7,7,832 +Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832 +Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 -23330=4,3,7,7,384 0=384 1=1 5=1 6=319488 8=2 9=1 +Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce -23330=4,3,7,7,192 0=192 1=1 5=1 6=159744 8=102 9=1 +Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 -23330=4,3,7,7,384 0=384 1=3 4=1 5=1 6=663552 8=2 9=1 +Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce -23330=4,3,7,7,48 0=48 1=1 5=1 6=39936 8=102 9=1 +Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=153600 8=2 9=1 +Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool -23330=4,3,7,7,832 1=3 3=1 +Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 8=2 9=1 +Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj 
inception_5b/output -23330=4,3,7,7,1024 +Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 -23330=4,3,1,1,1024 0=1 1=7 +InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 +Softmax prob 1 1 loss3/classifier output -23330=4,1,1000,1,1 diff --git a/benchmark/models/mobilenet_int8.param b/benchmark/models/mobilenet_int8.param index 12c63e0f5e30..f5a078b2d2d6 100644 --- a/benchmark/models/mobilenet_int8.param +++ b/benchmark/models/mobilenet_int8.param @@ -1,33 +1,33 @@ 7767517 31 31 -Input data 0 1 data 0=224 1=224 2=3 -Convolution conv1 1 1 data conv1_relu1 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 -ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw_relu2_1/dw 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 -Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep 0=64 1=1 5=1 6=2048 8=102 9=1 -ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 -Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep 0=128 1=1 5=1 6=8192 8=102 9=1 -ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 -Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep 0=128 1=1 5=1 6=16384 8=102 9=1 -ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 -Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep 0=256 1=1 5=1 6=32768 8=102 9=1 -ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 -Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep 0=256 1=1 5=1 6=65536 8=102 9=1 -ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 -Convolution conv4_2/sep 1 1 
conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep 0=512 1=1 5=1 6=131072 8=102 9=1 -ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep 0=1024 1=1 5=1 6=524288 8=102 9=1 -ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 -Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep 0=1024 1=1 5=1 6=1048576 8=2 9=1 -Pooling pool6 1 1 conv6/sep_relu6/sep pool6 0=1 4=1 -InnerProduct fc7 1 1 pool6 fc7 0=1000 1=1 2=1024000 8=2 -Softmax prob 1 1 fc7 output +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 
+ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw_relu2_1/dw -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 +Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep -23330=4,3,112,112,64 0=64 1=1 5=1 6=2048 8=102 9=1 +ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 +Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=8192 8=102 9=1 +ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw -23330=4,3,56,56,128 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 +Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=16384 8=102 9=1 +ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 +Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=32768 8=102 9=1 +ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw -23330=4,3,28,28,256 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 +Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=65536 8=102 9=1 +ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 +Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=131072 8=102 9=1 +ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw 
-23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=524288 8=102 9=1 +ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw -23330=4,3,7,7,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 +Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=1048576 8=2 9=1 +Pooling pool6 1 1 conv6/sep_relu6/sep pool6 -23330=4,1,1024,1,1 0=1 4=1 +InnerProduct fc7 1 1 pool6 fc7 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 8=2 +Softmax prob 1 1 fc7 output -23330=4,1,1000,1,1 diff --git a/benchmark/models/mobilenet_ssd_int8.param b/benchmark/models/mobilenet_ssd_int8.param index 3b38cf04ad70..90460ef8decf 100644 --- a/benchmark/models/mobilenet_ssd_int8.param +++ b/benchmark/models/mobilenet_ssd_int8.param @@ -1,94 +1,94 @@ 
7767517 92 115 -Input input 0 1 data 0=300 1=300 2=3 -Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -Convolution conv0 1 1 data_splitncnn_6 conv0_conv0/relu 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 -ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 -Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu 0=64 1=1 5=1 6=2048 8=102 9=1 -ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 -Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu 0=128 1=1 5=1 6=8192 8=102 9=1 -ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 -Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu 0=128 1=1 5=1 6=16384 8=102 9=1 -ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 -Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu 0=256 1=1 5=1 6=32768 8=102 9=1 -ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 -Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu 0=256 1=1 5=1 6=65536 8=102 9=1 -ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 -Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu 0=512 1=1 5=1 6=131072 8=102 9=1 -ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu 0=512 1=1 5=1 6=262144 8=102 9=1 
-ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu 0=512 1=1 5=1 6=262144 8=102 9=1 -ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu 0=512 1=1 5=1 6=262144 8=2 9=1 -Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 -ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 -Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu 0=1024 1=1 5=1 6=524288 8=102 9=1 -ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 -Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu 0=1024 1=1 5=1 6=1048576 8=2 9=1 -Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 -Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1 -Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 -Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu 0=128 1=1 5=1 6=65536 8=102 9=1 -Convolution conv15_2 1 1 
conv15_1_conv15_1/relu conv15_2_conv15_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 -Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 -Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu 0=128 1=1 5=1 6=32768 8=102 9=1 -Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 -Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 -Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1 -Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 -Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 -Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 5=1 6=6144 8=2 -Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3 -Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat -Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 5=1 6=32256 8=2 -Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3 -Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat -PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000 -Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 5=1 6=24576 8=2 -Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3 -Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm 
conv13_mbox_loc_flat -Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 5=1 6=129024 8=2 -Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3 -Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat -PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 -Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 5=1 6=12288 8=2 -Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3 -Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat -Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 5=1 6=64512 8=2 -Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3 -Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat -PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 -Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2 -Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3 -Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat -Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2 -Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3 -Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat -PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 9=-233 
10=-233 13=0.500000 -Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2 -Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3 -Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat -Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2 -Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3 -Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat -PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 -Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 5=1 6=3072 8=2 -Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3 -Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat -Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 5=1 6=16128 8=2 -Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3 -Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat -PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 -Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc -Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf -Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox 
conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1 -Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 -Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1 -Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten +Input input 0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3 +Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3 +Convolution conv0 1 1 data_splitncnn_6 conv0_conv0/relu -23330=4,3,150,150,32 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1 +ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,150,150,32 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1 +Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,150,150,64 0=64 1=1 5=1 6=2048 8=102 9=1 +ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,75,75,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1 +Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=8192 8=102 9=1 +ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,75,75,128 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1 +Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=16384 8=102 9=1 +ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,38,38,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1 +Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=32768 8=102 9=1 +ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,38,38,256 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1 +Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=65536 8=102 9=1 
+ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,19,19,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1 +Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=131072 8=102 9=1 +ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1 +ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=2 9=1 +Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 -23330=16,3,19,19,512,3,19,19,512,3,19,19,512,3,19,19,512 +ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu -23330=4,3,10,10,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1 +Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu 
-23330=4,3,10,10,1024 0=1024 1=1 5=1 6=524288 8=102 9=1 +ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,10,10,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1 +Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=1048576 8=2 9=1 +Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 -23330=16,3,10,10,1024,3,10,10,1024,3,10,10,1024,3,10,10,1024 +Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu -23330=4,3,10,10,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu -23330=4,3,5,5,512 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1 +Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 -23330=16,3,5,5,512,3,5,5,512,3,5,5,512,3,5,5,512 +Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu -23330=4,3,5,5,128 0=128 1=1 5=1 6=65536 8=102 9=1 +Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu -23330=4,3,3,3,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 +Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 -23330=16,3,3,3,256,3,3,3,256,3,3,3,256,3,3,3,256 +Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu -23330=4,3,3,3,128 0=128 1=1 5=1 6=32768 8=102 9=1 +Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 +Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 
conv16_2_conv16_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256 +Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 8=102 9=1 +Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 +Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128 +Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc -23330=4,3,19,19,12 0=12 1=1 5=1 6=6144 8=2 +Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm -23330=4,3,12,19,19 0=3 +Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat -23330=4,1,4332,1,1 +Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf -23330=4,3,19,19,63 0=63 1=1 5=1 6=32256 8=2 +Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm -23330=4,3,63,19,19 0=3 +Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat -23330=4,1,22743,1,1 +PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23330=4,2,4332,2,1 -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000 +Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc -23330=4,3,10,10,24 0=24 1=1 5=1 6=24576 8=2 +Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm -23330=4,3,24,10,10 0=3 +Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat -23330=4,1,2400,1,1 +Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf -23330=4,3,10,10,126 0=126 1=1 5=1 6=129024 8=2 +Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm -23330=4,3,126,10,10 0=3 +Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat 
-23330=4,1,12600,1,1 +PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23330=4,2,2400,2,1 -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 +Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc -23330=4,3,5,5,24 0=24 1=1 5=1 6=12288 8=2 +Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm -23330=4,3,24,5,5 0=3 +Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat -23330=4,1,600,1,1 +Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf -23330=4,3,5,5,126 0=126 1=1 5=1 6=64512 8=2 +Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm -23330=4,3,126,5,5 0=3 +Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat -23330=4,1,3150,1,1 +PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23330=4,2,600,2,1 -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 +Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc -23330=4,3,3,3,24 0=24 1=1 5=1 6=6144 8=2 +Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm -23330=4,3,24,3,3 0=3 +Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat -23330=4,1,216,1,1 +Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf -23330=4,3,3,3,126 0=126 1=1 5=1 6=32256 8=2 +Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm -23330=4,3,126,3,3 0=3 +Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat -23330=4,1,1134,1,1 +PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23330=4,2,216,2,1 -23300=1,195.000000 -23301=1,240.000000 
-23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 +Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc -23330=4,3,2,2,24 0=24 1=1 5=1 6=6144 8=2 +Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm -23330=4,3,24,2,2 0=3 +Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat -23330=4,1,96,1,1 +Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf -23330=4,3,2,2,126 0=126 1=1 5=1 6=32256 8=2 +Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm -23330=4,3,126,2,2 0=3 +Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat -23330=4,1,504,1,1 +PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 +Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc -23330=4,3,1,1,24 0=24 1=1 5=1 6=3072 8=2 +Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm -23330=4,3,24,1,1 0=3 +Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat -23330=4,1,24,1,1 +Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf -23330=4,3,1,1,126 0=126 1=1 5=1 6=16128 8=2 +Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm -23330=4,3,126,1,1 0=3 +Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat -23330=4,1,126,1,1 +PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23330=4,2,24,2,1 -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000 +Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat 
mbox_loc -23330=4,1,7668,1,1 +Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf -23330=4,1,40257,1,1 +Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox -23330=4,2,7668,2,1 0=1 +Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,1917,1 0=21 1=-1 +Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,1917,1 0=1 1=1 +Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,40257,1,1 DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000 diff --git a/benchmark/models/resnet18_int8.param b/benchmark/models/resnet18_int8.param index a546b5fdf2fe..ed42e90e9d33 100644 --- a/benchmark/models/resnet18_int8.param +++ b/benchmark/models/resnet18_int8.param @@ -1,52 +1,52 @@ 7767517 50 58 -Input data 0 1 data 0=224 1=224 2=3 -Convolution conv1 1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 -Pooling pool1 1 1 conv1_conv1_relu pool1 1=3 2=2 -Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=64 1=1 5=1 6=4096 8=2 -Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 -Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2 -Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -ReLU res2a_relu 1 1 res2a res2a_res2a_relu -Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 
9=1 -Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2 -Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -ReLU res2b_relu 1 1 res2b res2b_res2b_relu -Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=128 1=1 3=2 5=1 6=8192 8=2 -Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1 -Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2 -Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -ReLU res3a_relu 1 1 res3a res3a_res3a_relu -Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 -Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2 -Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -ReLU res3b_relu 1 1 res3b res3b_res3b_relu -Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=256 1=1 3=2 5=1 6=32768 8=2 -Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1 -Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2 -Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -ReLU res4a_relu 1 1 res4a res4a_res4a_relu 
-Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2 -Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -ReLU res4b_relu 1 1 res4b res4b_res4b_relu -Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2 -Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1 -Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2 -Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -ReLU res5a_relu 1 1 res5a res5a_res5a_relu -Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2 -Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -ReLU res5b_relu 1 1 res5b res5b_res5b_relu -Pooling pool5 1 1 res5b_res5b_relu pool5 0=1 1=7 -InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=512000 -Softmax prob 1 1 fc1000 output +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 +Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2 +Split 
splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 +Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=2 +Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 +Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=2 +Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a -23330=4,3,56,56,64 0=1 +ReLU res2a_relu 1 1 res2a res2a_res2a_relu -23330=4,3,56,56,64 +Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 +Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 +Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=2 +Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b -23330=4,3,56,56,64 0=1 +ReLU res2b_relu 1 1 res2b res2b_res2b_relu -23330=4,3,56,56,64 +Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 +Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=8192 8=2 +Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1 +Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=2 +Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a -23330=4,3,28,28,128 0=1 +ReLU res3a_relu 1 1 res3a 
res3a_res3a_relu -23330=4,3,28,28,128 +Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128 +Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 +Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=2 +Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b -23330=4,3,28,28,128 0=1 +ReLU res3b_relu 1 1 res3b res3b_res3b_relu -23330=4,3,28,28,128 +Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128 +Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=32768 8=2 +Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1 +Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=2 +Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a -23330=4,3,14,14,256 0=1 +ReLU res4a_relu 1 1 res4a res4a_res4a_relu -23330=4,3,14,14,256 +Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256 +Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=2 +Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b -23330=4,3,14,14,256 0=1 +ReLU res4b_relu 1 1 res4b res4b_res4b_relu 
-23330=4,3,14,14,256 +Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256 +Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=131072 8=2 +Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1 +Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=2 +Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a -23330=4,3,7,7,512 0=1 +ReLU res5a_relu 1 1 res5a res5a_res5a_relu -23330=4,3,7,7,512 +Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,512,3,7,7,512 +Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=2 +Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b -23330=4,3,7,7,512 0=1 +ReLU res5b_relu 1 1 res5b res5b_res5b_relu -23330=4,3,7,7,512 +Pooling pool5 1 1 res5b_res5b_relu pool5 -23330=4,3,1,1,512 0=1 1=7 +InnerProduct fc1000 1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=512000 +Softmax prob 1 1 fc1000 output -23330=4,1,1000,1,1 diff --git a/benchmark/models/resnet50_int8.param b/benchmark/models/resnet50_int8.param index 48dfbf4bbd82..9970c12df020 100644 --- a/benchmark/models/resnet50_int8.param +++ b/benchmark/models/resnet50_int8.param @@ -1,108 +1,108 @@ 7767517 106 122 -Input data 0 1 data 0=224 1=224 2=3 -Convolution conv1 1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 -Pooling pool1 1 1 conv1_conv1_relu pool1 1=3 2=2 -Split 
splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=256 1=1 5=1 6=16384 8=2 -Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=1 5=1 6=4096 8=102 9=1 -Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 -Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c 0=256 1=1 5=1 6=16384 8=2 -Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -ReLU res2a_relu 1 1 res2a res2a_res2a_relu -Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1 -Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 -Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c 0=256 1=1 5=1 6=16384 8=2 -Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -ReLU res2b_relu 1 1 res2b res2b_res2b_relu -Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1 -Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 -Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c 0=256 1=1 5=1 6=16384 8=2 -Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -ReLU res2c_relu 1 1 res2c res2c_res2c_relu -Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 
res2c_res2c_relu_splitncnn_1 -Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2 -Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=1 3=2 5=1 6=32768 8=102 9=1 -Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 -Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c 0=512 1=1 5=1 6=65536 8=2 -Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -ReLU res3a_relu 1 1 res3a res3a_res3a_relu -Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 -Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 -Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c 0=512 1=1 5=1 6=65536 8=2 -Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -ReLU res3b_relu 1 1 res3b res3b_res3b_relu -Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 -Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 -Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c 0=512 1=1 5=1 6=65536 8=2 -Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -ReLU res3c_relu 1 1 res3c res3c_res3c_relu -Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 
res3c_res3c_relu_splitncnn_1 -Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1 -Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 -Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c 0=512 1=1 5=1 6=65536 8=2 -Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -ReLU res3d_relu 1 1 res3d res3d_res3d_relu -Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 -Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=1024 1=1 3=2 5=1 6=524288 8=2 -Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=1 3=2 5=1 6=131072 8=102 9=1 -Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -ReLU res4a_relu 1 1 res4a res4a_res4a_relu -Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -ReLU res4b_relu 1 1 res4b res4b_res4b_relu -Split splitncnn_9 1 2 res4b_res4b_relu 
res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -ReLU res4c_relu 1 1 res4c res4c_res4c_relu -Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 -Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -ReLU res4d_relu 1 1 res4d res4d_res4d_relu -Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 -Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -ReLU res4e_relu 1 1 res4e res4e_res4e_relu -Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 -Convolution res4f_branch2a 1 1 
res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1 -Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c 0=1024 1=1 5=1 6=262144 8=2 -Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -ReLU res4f_relu 1 1 res4f res4f_res4f_relu -Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 -Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=2048 1=1 3=2 5=1 6=2097152 8=2 -Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=1 3=2 5=1 6=524288 8=102 9=1 -Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c 0=2048 1=1 5=1 6=1048576 8=2 -Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -ReLU res5a_relu 1 1 res5a res5a_res5a_relu -Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1 -Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c 0=2048 1=1 5=1 6=1048576 8=2 -Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -ReLU res5b_relu 1 1 res5b res5b_res5b_relu -Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 -Convolution 
res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1 -Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c 0=2048 1=1 5=1 6=1048576 8=2 -Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -ReLU res5c_relu 1 1 res5c res5c_res5c_relu -Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 -InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 -Softmax prob 1 1 fc1000 output +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1 +Pooling pool1 1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2 +Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64 +Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2 +Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=102 9=1 +Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 +Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2 +Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a -23330=4,3,56,56,256 0=1 +ReLU res2a_relu 1 1 res2a res2a_res2a_relu -23330=4,3,56,56,256 +Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 +Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 
8=102 9=1 +Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 +Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2 +Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b -23330=4,3,56,56,256 0=1 +ReLU res2b_relu 1 1 res2b res2b_res2b_relu -23330=4,3,56,56,256 +Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 +Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 8=102 9=1 +Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1 +Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2 +Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c -23330=4,3,56,56,256 0=1 +ReLU res2c_relu 1 1 res2c res2c_res2c_relu -23330=4,3,56,56,256 +Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256 +Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,512 0=512 1=1 3=2 5=1 6=131072 8=2 +Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=32768 8=102 9=1 +Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 +Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2 +Eltwise res3a 2 1 
res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a -23330=4,3,28,28,512 0=1 +ReLU res3a_relu 1 1 res3a res3a_res3a_relu -23330=4,3,28,28,512 +Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 +Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1 +Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 +Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2 +Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b -23330=4,3,28,28,512 0=1 +ReLU res3b_relu 1 1 res3b res3b_res3b_relu -23330=4,3,28,28,512 +Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 +Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1 +Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 +Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2 +Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c -23330=4,3,28,28,512 0=1 +ReLU res3c_relu 1 1 res3c res3c_res3c_relu -23330=4,3,28,28,512 +Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 +Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1 +Convolution res3d_branch2b 1 1 
res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1 +Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2 +Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d -23330=4,3,28,28,512 0=1 +ReLU res3d_relu 1 1 res3d res3d_res3d_relu -23330=4,3,28,28,512 +Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512 +Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,1024 0=1024 1=1 3=2 5=1 6=524288 8=2 +Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=131072 8=102 9=1 +Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a -23330=4,3,14,14,1024 0=1 +ReLU res4a_relu 1 1 res4a res4a_res4a_relu -23330=4,3,14,14,1024 +Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4b 2 1 
res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b -23330=4,3,14,14,1024 0=1 +ReLU res4b_relu 1 1 res4b res4b_res4b_relu -23330=4,3,14,14,1024 +Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c -23330=4,3,14,14,1024 0=1 +ReLU res4c_relu 1 1 res4c res4c_res4c_relu -23330=4,3,14,14,1024 +Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d -23330=4,3,14,14,1024 0=1 +ReLU res4d_relu 1 1 res4d res4d_res4d_relu -23330=4,3,14,14,1024 +Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution 
res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e -23330=4,3,14,14,1024 0=1 +ReLU res4e_relu 1 1 res4e res4e_res4e_relu -23330=4,3,14,14,1024 +Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1 +Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 +Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2 +Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f -23330=4,3,14,14,1024 0=1 +ReLU res4f_relu 1 1 res4f res4f_res4f_relu -23330=4,3,14,14,1024 +Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024 +Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,2048 0=2048 1=1 3=2 5=1 6=2097152 8=2 +Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=524288 8=102 9=1 +Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2 +Eltwise res5a 2 1 
res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a -23330=4,3,7,7,2048 0=1 +ReLU res5a_relu 1 1 res5a res5a_res5a_relu -23330=4,3,7,7,2048 +Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048 +Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 8=102 9=1 +Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2 +Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b -23330=4,3,7,7,2048 0=1 +ReLU res5b_relu 1 1 res5b res5b_res5b_relu -23330=4,3,7,7,2048 +Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048 +Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 8=102 9=1 +Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2 +Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c -23330=4,3,7,7,2048 0=1 +ReLU res5c_relu 1 1 res5c res5c_res5c_relu -23330=4,3,7,7,2048 +Pooling pool5 1 1 res5c_res5c_relu pool5 -23330=4,3,1,1,2048 0=1 1=7 +InnerProduct fc1000 1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=2048000 +Softmax prob 1 1 fc1000 output -23330=4,1,1000,1,1 diff --git a/benchmark/models/squeezenet_int8.param b/benchmark/models/squeezenet_int8.param index 
c12699380517..b29447ee40f0 100644 --- a/benchmark/models/squeezenet_int8.param +++ b/benchmark/models/squeezenet_int8.param @@ -1,50 +1,50 @@ 7767517 48 56 -Input data 0 1 data 0=227 1=227 2=3 -Convolution conv1 1 1 data conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 -Pooling pool1 1 1 conv1_relu_conv1 pool1 1=3 2=2 -Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1 -Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 -Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 -Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1 -Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 -Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 -Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -Pooling pool3 1 1 fire3/concat pool3 1=3 2=2 -Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1 -Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 
fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 -Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 -Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1 -Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 -Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 -Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -Pooling pool5 1 1 fire5/concat pool5 1=3 2=2 -Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1 -Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 -Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 -Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 
5=1 6=18432 8=102 9=1 -Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 -Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 -Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1 -Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 -Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 -Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1 -Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 -Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 -Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 
fire9/concat_drop9 -Convolution conv10 1 1 fire9/concat_drop9 conv10_relu_conv10 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1 -Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 4=1 -Softmax prob 1 1 pool10 output +Input data 0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3 +Convolution conv1 1 1 data conv1_relu_conv1 -23330=4,3,113,113,64 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 +Pooling pool1 1 1 conv1_relu_conv1 pool1 -23330=4,3,56,56,64 1=3 2=2 +Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=1024 8=102 9=1 +Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16 +Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 8=2 9=1 +Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 +Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,56,56,128 +Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=2048 8=102 9=1 +Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16 +Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 8=2 9=1 +Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 +Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 
fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,56,56,128 +Pooling pool3 1 1 fire3/concat pool3 -23330=4,3,28,28,128 1=3 2=2 +Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=4096 8=102 9=1 +Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 +Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 8=2 9=1 +Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 +Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,28,28,256 +Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 8=102 9=1 +Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32 +Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 8=2 9=1 +Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 +Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,28,28,256 +Pooling pool5 1 1 fire5/concat pool5 -23330=4,3,14,14,256 1=3 2=2 +Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=12288 8=102 9=1 +Split splitncnn_4 1 2 
fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48 +Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 8=2 9=1 +Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 +Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,14,14,384 +Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=18432 8=102 9=1 +Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48 +Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 8=2 9=1 +Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 +Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,14,14,384 +Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 8=102 9=1 +Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 +Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 8=2 9=1 +Convolution fire8/expand3x3 1 
1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 +Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,14,14,512 +Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=102 9=1 +Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64 +Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 8=2 9=1 +Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 +Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9 -23330=4,3,14,14,512 +Convolution conv10 1 1 fire9/concat_drop9 conv10_relu_conv10 -23330=4,3,16,16,1000 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1 +Pooling pool10 1 1 conv10_relu_conv10 pool10 -23330=4,1,1000,1,1 0=1 4=1 +Softmax prob 1 1 pool10 output -23330=4,1,1000,1,1 diff --git a/benchmark/models/squeezenet_ssd_int8.param b/benchmark/models/squeezenet_ssd_int8.param index 177d18729cd3..050166202d62 100644 --- a/benchmark/models/squeezenet_ssd_int8.param +++ b/benchmark/models/squeezenet_ssd_int8.param @@ -1,121 +1,121 @@ 7767517 119 152 -Input data 0 1 data 0=300 1=300 2=3 -Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -Convolution conv1 1 1 data_splitncnn_6 conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 -Pooling pool1 1 1 conv1_relu_conv1 pool1 1=3 2=2 -Convolution fire2/squeeze1x1 1 1 pool1 
fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1 -Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 -Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 -Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1 -Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1 -Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 -Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -Pooling pool3 1 1 fire3/concat pool3 1=3 2=2 -Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1 -Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 -Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 -Concat fire4/concat 2 1 
fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1 -Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1 -Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 -Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 -Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 1=3 2=2 -Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1 -Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 -Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 -Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1 -Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 
fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1 -Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 -Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1 -Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 -Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 -Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1 -Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1 -Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 -Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat -Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 -Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 1=3 2=2 -Convolution fire10/squeeze1x1 1 1 pool9 
fire10/squeeze1x1_fire10/relu_squeeze1x1 0=96 1=1 5=1 6=49152 8=102 9=1 -Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 -Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1 -Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 -Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat -Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 -Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 1=3 2=2 -Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 0=96 1=1 5=1 6=73728 8=102 9=1 -Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 -Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1 -Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 -Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat -Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 -Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu 0=128 1=1 5=1 6=98304 8=102 9=1 -Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 -Split splitncnn_15 1 4 
conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 -Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1 -Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 -Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 -BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale 0=256 -Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 -Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 4=1 5=1 6=36864 8=2 -Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3 -Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat -Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 4=1 5=1 6=193536 8=2 -Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3 -Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat -PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000 -Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 4=1 5=1 6=110592 8=2 -Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3 -Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat -Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 4=1 5=1 6=580608 8=2 -Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3 -Flatten fire9_mbox_conf_flat 1 1 
fire9_mbox_conf_perm fire9_mbox_conf_flat -PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000 -Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2 -Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3 -Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat -Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2 -Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3 -Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat -PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000 -Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2 -Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3 -Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat -Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2 -Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3 -Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat -PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000 -Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 4=1 5=1 6=55296 8=2 -Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3 -Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm 
conv12_2_mbox_loc_flat -Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 4=1 5=1 6=290304 8=2 -Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3 -Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat -PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000 -Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 4=1 5=1 6=18432 8=2 -Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3 -Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat -Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 4=1 5=1 6=96768 8=2 -Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3 -Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat -PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000 -Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc -Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf -Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1 -Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 -Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1 -Flatten 
mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten +Input data 0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3 +Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3 +Convolution conv1 1 1 data_splitncnn_6 conv1_relu_conv1 -23330=4,3,149,149,64 0=64 1=3 3=2 5=1 6=1728 8=2 9=1 +Pooling pool1 1 1 conv1_relu_conv1 pool1 -23330=4,3,74,74,64 1=3 2=2 +Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=1024 8=102 9=1 +Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16 +Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 8=2 9=1 +Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1 +Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,74,74,128 +Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=2048 8=102 9=1 +Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16 +Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 8=2 9=1 +Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 
6=9216 8=2 9=1 +Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,74,74,128 +Pooling pool3 1 1 fire3/concat pool3 -23330=4,3,37,37,128 1=3 2=2 +Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=4096 8=102 9=1 +Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32 +Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 8=2 9=1 +Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 +Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,37,37,256 +Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=8192 8=102 9=1 +Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32 +Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 8=2 9=1 +Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1 +Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,37,37,256 +Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 -23330=8,3,37,37,256,3,37,37,256 +Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 
-23330=4,3,18,18,256 1=3 2=2 +Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=12288 8=102 9=1 +Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48 +Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 8=2 9=1 +Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 +Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,18,18,384 +Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=18432 8=102 9=1 +Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48 +Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 8=2 9=1 +Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1 +Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,18,18,384 +Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=24576 8=102 9=1 +Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64 +Convolution 
fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 8=2 9=1 +Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 +Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,18,18,512 +Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=32768 8=102 9=1 +Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64 +Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 8=2 9=1 +Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1 +Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat -23330=4,3,18,18,512 +Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 -23330=16,3,18,18,512,3,18,18,512,3,18,18,512,3,18,18,512 +Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 -23330=4,3,9,9,512 1=3 2=2 +Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 -23330=4,3,9,9,96 0=96 1=1 5=1 6=49152 8=102 9=1 +Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 -23330=8,3,9,9,96,3,9,9,96 +Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 
fire10/expand1x1_fire10/relu_expand1x1 -23330=4,3,9,9,384 0=384 1=1 5=1 6=36864 8=2 9=1 +Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 -23330=4,3,9,9,384 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 +Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat -23330=4,3,9,9,768 +Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 -23330=16,3,9,9,768,3,9,9,768,3,9,9,768,3,9,9,768 +Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 -23330=4,3,4,4,768 1=3 2=2 +Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 -23330=4,3,4,4,96 0=96 1=1 5=1 6=73728 8=102 9=1 +Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 -23330=8,3,4,4,96,3,4,4,96 +Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 -23330=4,3,4,4,384 0=384 1=1 5=1 6=36864 8=2 9=1 +Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 -23330=4,3,4,4,384 0=384 1=3 4=1 5=1 6=331776 8=2 9=1 +Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat -23330=4,3,4,4,768 +Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 -23330=16,3,4,4,768,3,4,4,768,3,4,4,768,3,4,4,768 +Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu -23330=4,3,4,4,128 0=128 1=1 5=1 6=98304 8=102 9=1 +Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1 +Split splitncnn_15 1 4 conv12_2_conv12_2/relu 
conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256 +Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 8=102 9=1 +Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1 +Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128 +BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale -23330=4,3,37,37,256 0=256 +Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 -23330=12,3,37,37,256,3,37,37,256,3,37,37,256 +Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc -23330=4,3,37,37,16 0=16 1=3 4=1 5=1 6=36864 8=2 +Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm -23330=4,3,16,37,37 0=3 +Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat -23330=4,1,21904,1,1 +Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf -23330=4,3,37,37,84 0=84 1=3 4=1 5=1 6=193536 8=2 +Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm -23330=4,3,84,37,37 0=3 +Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat -23330=4,1,114996,1,1 +PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23330=4,2,21904,2,1 -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000 +Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc -23330=4,3,18,18,24 0=24 1=3 4=1 5=1 6=110592 8=2 +Permute fire9_mbox_loc_perm 1 1 
fire9_mbox_loc fire9_mbox_loc_perm -23330=4,3,24,18,18 0=3 +Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat -23330=4,1,7776,1,1 +Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf -23330=4,3,18,18,126 0=126 1=3 4=1 5=1 6=580608 8=2 +Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm -23330=4,3,126,18,18 0=3 +Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat -23330=4,1,40824,1,1 +PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23330=4,2,7776,2,1 -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000 +Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc -23330=4,3,9,9,24 0=24 1=3 4=1 5=1 6=165888 8=2 +Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm -23330=4,3,24,9,9 0=3 +Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat -23330=4,1,1944,1,1 +Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf -23330=4,3,9,9,126 0=126 1=3 4=1 5=1 6=870912 8=2 +Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm -23330=4,3,126,9,9 0=3 +Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat -23330=4,1,10206,1,1 +PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23330=4,2,1944,2,1 -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000 +Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc -23330=4,3,4,4,24 0=24 1=3 4=1 5=1 6=165888 8=2 +Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm -23330=4,3,24,4,4 0=3 +Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat -23330=4,1,384,1,1 +Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf -23330=4,3,4,4,126 0=126 1=3 4=1 
5=1 6=870912 8=2 +Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm -23330=4,3,126,4,4 0=3 +Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat -23330=4,1,2016,1,1 +PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23330=4,2,384,2,1 -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000 +Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc -23330=4,3,2,2,24 0=24 1=3 4=1 5=1 6=55296 8=2 +Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm -23330=4,3,24,2,2 0=3 +Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat -23330=4,1,96,1,1 +Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf -23330=4,3,2,2,126 0=126 1=3 4=1 5=1 6=290304 8=2 +Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm -23330=4,3,126,2,2 0=3 +Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat -23330=4,1,504,1,1 +PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000 +Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc -23330=4,3,1,1,16 0=16 1=3 4=1 5=1 6=18432 8=2 +Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm -23330=4,3,16,1,1 0=3 +Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat -23330=4,1,16,1,1 +Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf -23330=4,3,1,1,84 0=84 1=3 4=1 5=1 6=96768 8=2 +Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm -23330=4,3,84,1,1 0=3 +Flatten conv13_2_mbox_conf_flat 1 1 
conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat -23330=4,1,84,1,1 +PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23330=4,2,16,2,1 -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000 +Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc -23330=4,1,32120,1,1 +Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf -23330=4,1,168630,1,1 +Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox -23330=4,2,32120,2,1 0=1 +Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,8030,1 0=21 1=-1 +Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,8030,1 0=1 1=1 +Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,168630,1,1 DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000 diff --git a/benchmark/models/vgg16_int8.param b/benchmark/models/vgg16_int8.param index fa2aff591ada..37159f47ce98 100644 --- a/benchmark/models/vgg16_int8.param +++ b/benchmark/models/vgg16_int8.param @@ -1,25 +1,25 @@ 7767517 23 23 -Input data 0 1 data 0=224 1=224 2=3 -Convolution conv1_1 1 1 data conv1_1_relu1_1 0=64 1=3 4=1 5=1 6=1728 8=102 9=1 -Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2_relu1_2 0=64 1=3 4=1 5=1 6=36864 8=2 9=1 -Pooling pool1 1 1 conv1_2_relu1_2 pool1 1=2 2=2 -Convolution conv2_1 1 1 pool1 conv2_1_relu2_1 0=128 1=3 4=1 5=1 6=73728 8=102 9=1 -Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2_relu2_2 0=128 1=3 4=1 5=1 6=147456 8=2 9=1 -Pooling pool2 1 1 conv2_2_relu2_2 pool2 1=2 2=2 -Convolution conv3_1 1 1 
pool2 conv3_1_relu3_1 0=256 1=3 4=1 5=1 6=294912 8=102 9=1 -Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2_relu3_2 0=256 1=3 4=1 5=1 6=589824 8=102 9=1 -Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3_relu3_3 0=256 1=3 4=1 5=1 6=589824 8=2 9=1 -Pooling pool3 1 1 conv3_3_relu3_3 pool3 1=2 2=2 -Convolution conv4_1 1 1 pool3 conv4_1_relu4_1 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1 -Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2_relu4_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3_relu4_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 -Pooling pool4 1 1 conv4_3_relu4_3 pool4 1=2 2=2 -Convolution conv5_1 1 1 pool4 conv5_1_relu5_1 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2_relu5_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 -Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3_relu5_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 -Pooling pool5 1 1 conv5_3_relu5_3 pool5 1=2 2=2 -InnerProduct fc6 1 1 pool5 fc6_drop6 0=4096 1=1 2=102760448 8=2 9=1 -InnerProduct fc7 1 1 fc6_drop6 fc7_drop7 0=4096 1=1 2=16777216 8=2 9=1 -InnerProduct fc8 1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 8=2 -Softmax prob 1 1 fc8 output +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution conv1_1 1 1 data conv1_1_relu1_1 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=1728 8=102 9=1 +Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2_relu1_2 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=36864 8=2 9=1 +Pooling pool1 1 1 conv1_2_relu1_2 pool1 -23330=4,3,112,112,64 1=2 2=2 +Convolution conv2_1 1 1 pool1 conv2_1_relu2_1 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=73728 8=102 9=1 +Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2_relu2_2 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=147456 8=2 9=1 +Pooling pool2 1 1 conv2_2_relu2_2 pool2 -23330=4,3,56,56,128 1=2 2=2 +Convolution conv3_1 1 1 pool2 conv3_1_relu3_1 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=294912 8=102 9=1 +Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2_relu3_2 -23330=4,3,56,56,256 0=256 1=3 4=1 
5=1 6=589824 8=102 9=1 +Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3_relu3_3 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 8=2 9=1 +Pooling pool3 1 1 conv3_3_relu3_3 pool3 -23330=4,3,28,28,256 1=2 2=2 +Convolution conv4_1 1 1 pool3 conv4_1_relu4_1 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1 +Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2_relu4_2 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3_relu4_3 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 +Pooling pool4 1 1 conv4_3_relu4_3 pool4 -23330=4,3,14,14,512 1=2 2=2 +Convolution conv5_1 1 1 pool4 conv5_1_relu5_1 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2_relu5_2 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1 +Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3_relu5_3 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1 +Pooling pool5 1 1 conv5_3_relu5_3 pool5 -23330=4,3,7,7,512 1=2 2=2 +InnerProduct fc6 1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=102760448 8=2 9=1 +InnerProduct fc7 1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 8=2 9=1 +InnerProduct fc8 1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000 8=2 +Softmax prob 1 1 fc8 output -23330=4,1,1000,1,1 diff --git a/docs/developer-guide/glsl-extension.md b/docs/developer-guide/glsl-extension.md index 110fa4fafa4b..6fad73c4a539 100644 --- a/docs/developer-guide/glsl-extension.md +++ b/docs/developer-guide/glsl-extension.md @@ -166,6 +166,41 @@ shared lfp tmp_a[8][4][2]; |lfp|float|float|float|float16_t|float|bfloat16_t| |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4| +## integer type + +declare int8/int16 buffer data layout and local variables in glsl code + +```c +layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; +layout (binding = 1) readonly buffer weight_blob { sint16 weight_blob_data[]; }; +``` + +|int8 storage 
type|int8p|int8s|int8s+int8a| +|---|---|---|---| +|sint8|int|int8_t|int8_t| +|sint8vec4|int|int|int| + +|int8 arithmetic type|int8| +|---|---| +|aint8|int| +|aint8vec4|ivec4| + +|int16 arithmetic type|int16| +|---|---| +|aint16|int16_t when shaderInt16 is available, otherwise int| +|aint16vec4|i16vec4 when shaderInt16 is available, otherwise ivec4| + +|int16 storage/local type|int16p|int16s| +|---|---|---| +|sint16|int|int16_t| +|sint16vec4|ivec2|i16vec4| +|lint16|int|int16_t| +|lint16vec4|ivec2|i16vec4| + +`sint8vec4` uses one `int` to hold four signed int8 lanes in all int8 storage modes. This keeps pack4 data in packed form for integer dot-product and shared-memory paths. Use `i8buffer_ld4` to unpack it to `ivec4`, and use `i8buffer_sm4` to load the raw packed `int`. + +`sint16` uses one `int` to hold two signed int16 lanes when `opt.use_int16_packed` is enabled, and uses native `int16_t` when `opt.use_int16_storage` is enabled. `sint16vec4` stores four logical int16 lanes as two packed `int` values in int16p mode and as native `i16vec4` in int16s mode. `lint16` and `lint16vec4` are the shared/local-memory counterparts. + # buffer functions - load typed value from src[offset] @@ -203,6 +238,77 @@ void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets); ```c void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset); ``` + +# integer buffer functions + +- load integer typed value from src[offset] + +```c +aint8 i8buffer_ld1(sint8 src, int offset); +aint8vec4 i8buffer_ld4(sint8vec4 src, int offset); +int i8buffer_sm4(sint8vec4 src, int offset); +int i16buffer_ld1(sint16 src, int offset); +ivec2 i16buffer_ld2(sint16 src, int offset); +sint16vec4 i16buffer_sm4(sint16vec4 src, int offset); +aint16vec4 i16buffer_ld4(sint16vec4 src, int offset); +aint16 lint162aint16(lint16 v); +aint16vec4 lint162aint16vec4(lint16vec4 v); +``` + +`i8buffer_sm4` loads the raw packed `int` representation of four int8 lanes. 
It is useful for shared-memory staging and `dotPacked4x8EXT` paths where unpacking to `ivec4` would be wasteful. + +`i16buffer_ld1` and `i16buffer_ld2` load signed int16 lanes as `int` and `ivec2`. Without native int16 storage, `offset` is still the logical int16 lane offset, and packed storage groups two adjacent lanes in one `int`. + +`i16buffer_sm4` loads the raw `sint16vec4` representation of four logical int16 lanes from buffer storage. `i16buffer_ld4` loads four logical int16 lanes from buffer storage as `aint16vec4`. `lint162aint16` and `lint162aint16vec4` convert shared/local int16 values to arithmetic int16 values. + +- store integer typed value to dst[offset] + +```c +void i8buffer_st1(sint8 dst, int offset, aint8 v); +void i8buffer_st4(sint8vec4 dst, int offset, aint8vec4 v); +void i16buffer_st1(sint16 dst, int offset, int v); +void i16buffer_st2(sint16 dst, int offset, ivec2 v); +void i16buffer_st4(sint16vec4 dst, int offset, ivec4 v); +void i16buffer_st4(lint16vec4 dst, int offset, ivec4 v); +``` + +Without native int8 storage, `i8buffer_st1` updates one byte lane inside a packed `int` and may use an atomic compare-and-swap loop. + +Without native int16 storage, `i16buffer_st1` updates one int16 lane inside a packed `int` and may use an atomic compare-and-swap loop. `i16buffer_st2` stores complete packed words directly when `offset` is aligned. `i16buffer_st4` writes four logical int16 lanes to `sint16vec4` storage or `lint16vec4` shared/local memory. + +- copy int8 typed value from src[src_offset] to dst[dst_offset] + +```c +void i8buffer_cp1(sint8 dst, int dst_offset, sint8 src, int src_offset); +void i8buffer_cp4(sint8vec4 dst, int dst_offset, sint8vec4 src, int src_offset); +``` + +- copy and pack int8 typed values from src[src_offsets[0],src_offsets[1],...] 
to dst[dst_offset] + +```c +void i8buffer_cp1to4(sint8vec4 dst, int dst_offset, sint8 src, ivec4 src_offsets); +``` + +- copy and unpack int8 typed values from src[src_offset] to dst[dst_offsets[0],dst_offsets[1],...] + +```c +void i8buffer_cp4to1(sint8 dst, ivec4 dst_offsets, sint8vec4 src, int src_offset); +``` + +- pack and unpack signed integer lanes + +```c +ivec4 unpackInt4x8(int v); +int packInt4x8(ivec4 v); +ivec2 unpackInt2x16(int v); +int packInt2x16(ivec2 v); +int float2int8(float v); +ivec4 float2int8vec4(vec4 v); +``` + +`packInt4x8` stores `.r/.g/.b/.a` in the low-to-high bytes of one `int`. `packInt2x16` stores `.r/.g` in the low-to-high 16-bit lanes of one `int`. +`float2int8` and `float2int8vec4` round half away from zero and saturate to [-127, 127] for deterministic int8 quantization. + # local data conversion functions - storage buffer to local memory @@ -314,6 +420,16 @@ void main() // here is the code path optimized for subgroup_size == 32 #endif +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + // here is the packed int8 dot-product path +#endif + +#if ncnn_VK_KHR_cooperative_matrix + // here is the KHR cooperative matrix path +#elif ncnn_VK_NV_cooperative_matrix + // here is the NV cooperative matrix path +#endif + // use macro definitions uint size; // dynamic value from some previous routines if (size < ncnn_subgroupSize) @@ -329,6 +445,12 @@ void main() } ``` +Cooperative matrix shape and component-type combinations are selected on the host side. Use `GpuInfo::support_cooperative_matrix()`, `GpuInfo::support_int8_cooperative_matrix()`, `GpuInfo::support_bf16_cooperative_matrix()`, and `GpuInfo::get_optimal_cooperative_matrix_mnk()` before creating a cooperative matrix pipeline. + +For signed int8 cooperative matrix kernels, ncnn requires signed int8 A/B and signed int32 accumulator/result cooperative matrix support at subgroup scope. 
The shader still uses the normal `ncnn_VK_KHR_cooperative_matrix` / `ncnn_VK_NV_cooperative_matrix` extension macros to select GLSL syntax, while the host selects this path with `support_int8_cooperative_matrix()`. + +In int8 cooperative matrix and integer dot-product shaders, prefer keeping pack4 data in the packed `sint8vec4` representation and use `i8buffer_sm4` for shared-memory staging when the layout is already packed. Use `i8buffer_ld4` only when arithmetic needs unpacked `ivec4` lanes. + ### validation layer macros ncnn will define some additional convenient macros when the vulkan validation layer enabled @@ -357,7 +479,7 @@ At runtime, `NCNN_LOGE` will print out the value of `gx` enable glsl extension only if user enable some options -The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage` or `opt.use_bf16_storage` +The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage`, `opt.use_bf16_storage`, or `opt.use_int16_storage` The `GL_EXT_shader_explicit_arithmetic_types_float16` extension will be automatically enabled without explicit code indication when the device supports 16-bit arithmetic and the user turns on `opt.use_fp16_arithmetic` @@ -388,6 +510,10 @@ void main() |NCNN_int8_packed|opt.use_int8_packed| |NCNN_int8_storage|opt.use_int8_storage| |NCNN_int8_arithmetic|opt.use_int8_arithmetic| +|NCNN_int16_packed|opt.use_int16_packed| +|NCNN_int16_storage|opt.use_int16_storage| |NCNN_bf16_packed|opt.use_bf16_packed| |NCNN_bf16_storage|opt.use_bf16_storage| +|NCNN_fp16_uniform|opt.use_fp16_uniform| +|NCNN_int8_uniform|opt.use_int8_uniform| |NCNN_shader_local_memory|opt.use_shader_local_memory| diff --git a/docs/developer-guide/glsl-extension.zh.md b/docs/developer-guide/glsl-extension.zh.md 
index c586c784635e..290f8be65bae 100644 --- a/docs/developer-guide/glsl-extension.zh.md +++ b/docs/developer-guide/glsl-extension.zh.md @@ -166,6 +166,41 @@ shared lfp tmp_a[8][4][2]; |lfp|float|float|float|float16_t|float|bfloat16_t| |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4| +## 整数类型(integer type) + +在 GLSL 代码中声明 int8/int16 缓冲区数据布局和局部变量 + +```c +layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; +layout (binding = 1) readonly buffer weight_blob { sint16 weight_blob_data[]; }; +``` + +|int8 存储类型|int8p|int8s|int8s+int8a| +|---|---|---|---| +|sint8|int|int8_t|int8_t| +|sint8vec4|int|int|int| + +|int8 算术类型|int8| +|---|---| +|aint8|int| +|aint8vec4|ivec4| + +|int16 算术类型|int16| +|---|---| +|aint16|shaderInt16 可用时为 int16_t,否则为 int| +|aint16vec4|shaderInt16 可用时为 i16vec4,否则为 ivec4| + +|int16 存储/local 类型|int16p|int16s| +|---|---|---| +|sint16|int|int16_t| +|sint16vec4|ivec2|i16vec4| +|lint16|int|int16_t| +|lint16vec4|ivec2|i16vec4| + +`sint8vec4` 在所有 int8 存储模式下都使用一个 `int` 保存四个有符号 int8 lane。这样可以让 pack4 数据在 integer dot product 和 shared memory 路径中保持 packed 形式。使用 `i8buffer_ld4` 解包为 `ivec4`,使用 `i8buffer_sm4` 直接加载 packed `int`。 + +启用 `opt.use_int16_packed` 时,`sint16` 使用一个 `int` 保存两个有符号 int16 lane;启用 `opt.use_int16_storage` 时,`sint16` 使用原生 `int16_t`。`sint16vec4` 在 int16p 模式下使用两个 packed `int` 保存四个逻辑 int16 lane,在 int16s 模式下使用原生 `i16vec4`。`lint16` 和 `lint16vec4` 是 shared/local memory 对应类型。 + # 缓冲区函数(buffer functions) - 从 src[offset] 加载已经确定类型的值 @@ -204,6 +239,76 @@ void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets); void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset); ``` +# 整数缓冲区函数(integer buffer functions) + +- 从 src[offset] 加载整数类型的值 + +```c +aint8 i8buffer_ld1(sint8 src, int offset); +aint8vec4 i8buffer_ld4(sint8vec4 src, int offset); +int i8buffer_sm4(sint8vec4 src, int offset); +int i16buffer_ld1(sint16 src, int offset); +ivec2 i16buffer_ld2(sint16 src, int offset); +sint16vec4 
i16buffer_sm4(sint16vec4 src, int offset); +aint16vec4 i16buffer_ld4(sint16vec4 src, int offset); +aint16 lint162aint16(lint16 v); +aint16vec4 lint162aint16vec4(lint16vec4 v); +``` + +`i8buffer_sm4` 加载四个 int8 lane 的原始 packed `int` 表示。它适用于 shared memory 暂存和 `dotPacked4x8EXT` 路径,避免先解包成 `ivec4` 再重新打包。 + +`i16buffer_ld1` 和 `i16buffer_ld2` 将有符号 int16 lane 加载为 `int` 和 `ivec2`。没有原生 int16 storage 时,`offset` 仍表示逻辑 int16 lane 偏移,packed storage 会把相邻两个 lane 放在一个 `int` 中。 + +`i16buffer_sm4` 从 buffer storage 加载四个逻辑 int16 lane 的原始 `sint16vec4` 表示。`i16buffer_ld4` 从 buffer storage 将四个逻辑 int16 lane 加载为 `aint16vec4`。`lint162aint16` 和 `lint162aint16vec4` 将 shared/local int16 值转换为算术 int16 值。 + +- 将整数类型的值存储到 dst[offset] + +```c +void i8buffer_st1(sint8 dst, int offset, aint8 v); +void i8buffer_st4(sint8vec4 dst, int offset, aint8vec4 v); +void i16buffer_st1(sint16 dst, int offset, int v); +void i16buffer_st2(sint16 dst, int offset, ivec2 v); +void i16buffer_st4(sint16vec4 dst, int offset, ivec4 v); +void i16buffer_st4(lint16vec4 dst, int offset, ivec4 v); +``` + +没有原生 int8 storage 时,`i8buffer_st1` 会更新 packed `int` 中的一个 byte lane,可能使用 atomic compare-and-swap 循环。 + +没有原生 int16 storage 时,`i16buffer_st1` 会更新 packed `int` 中的一个 int16 lane,可能使用 atomic compare-and-swap 循环。`i16buffer_st2` 在 `offset` 对齐时会直接写入完整 packed word。`i16buffer_st4` 将四个逻辑 int16 lane 写入 `sint16vec4` 存储或 `lint16vec4` shared/local memory。 + +- 从 src[src_offset] 的 int8 类型值拷贝到 dst[dst_offset] + +```c +void i8buffer_cp1(sint8 dst, int dst_offset, sint8 src, int src_offset); +void i8buffer_cp4(sint8vec4 dst, int dst_offset, sint8vec4 src, int src_offset); +``` + +- 从 src[src_offsets[0],src_offsets[1],...] 的 int8 类型值拷贝并打包到 dst[dst_offset] + +```c +void i8buffer_cp1to4(sint8vec4 dst, int dst_offset, sint8 src, ivec4 src_offsets); +``` + +- 从 src[src_offset] 的 int8 类型值拷贝并解包到 dst[dst_offsets[0],dst_offsets[1],...] 
+ +```c +void i8buffer_cp4to1(sint8 dst, ivec4 dst_offsets, sint8vec4 src, int src_offset); +``` + +- 打包和解包有符号整数 lane + +```c +ivec4 unpackInt4x8(int v); +int packInt4x8(ivec4 v); +ivec2 unpackInt2x16(int v); +int packInt2x16(ivec2 v); +int float2int8(float v); +ivec4 float2int8vec4(vec4 v); +``` + +`packInt4x8` 将 `.r/.g/.b/.a` 按低字节到高字节保存到一个 `int`。`packInt2x16` 将 `.r/.g` 按低 16-bit lane 到高 16-bit lane 保存到一个 `int`。 +`float2int8` 和 `float2int8vec4` 使用 half-away-from-zero 规则,并饱和到 [-127, 127],用于确定性的 int8 量化。 + # 本地数据转换函数 - 存储缓冲区转换到本地内存 @@ -315,6 +420,16 @@ void main() // 为 subgroup_size == 32 优化的代码路径 #endif +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + // packed int8 dot-product 路径 +#endif + +#if ncnn_VK_KHR_cooperative_matrix + // KHR cooperative matrix 路径 +#elif ncnn_VK_NV_cooperative_matrix + // NV cooperative matrix 路径 +#endif + // 使用宏定义 uint size; // 来自先前例程的动态值 if (size < ncnn_subgroupSize) @@ -330,6 +445,12 @@ void main() } ``` +Cooperative matrix 的形状和 component type 组合在 host 侧选择。创建 cooperative matrix pipeline 前,应使用 `GpuInfo::support_cooperative_matrix()`、`GpuInfo::support_int8_cooperative_matrix()`、`GpuInfo::support_bf16_cooperative_matrix()` 和 `GpuInfo::get_optimal_cooperative_matrix_mnk()` 判断能力并选择参数。 + +对于有符号 int8 cooperative matrix kernel,ncnn 要求设备支持 subgroup scope 下的 signed int8 A/B 和 signed int32 accumulator/result cooperative matrix。shader 仍然使用普通的 `ncnn_VK_KHR_cooperative_matrix` / `ncnn_VK_NV_cooperative_matrix` 扩展宏选择 GLSL 语法,host 侧通过 `support_int8_cooperative_matrix()` 选择该路径。 + +在 int8 cooperative matrix 和 integer dot-product shader 中,如果数据布局已经是 packed 形式,应优先保持 `sint8vec4` 的 packed 表示,并使用 `i8buffer_sm4` 做 shared memory 暂存。只有在算术逻辑需要解包后的 `ivec4` lane 时才使用 `i8buffer_ld4`。 + ### 验证层宏定义 当启用 vulkan 验证层时,ncnn 会定义一些额外的便捷宏 @@ -358,7 +479,7 @@ void main() 仅当用户启用某些选项时才启用 GLSL 扩展 -`GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage` 或 `opt.use_bf16_storage` 
选项时,自动启用,无需显式代码指示。 +`GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage`、`opt.use_bf16_storage` 或 `opt.use_int16_storage` 选项时,自动启用,无需显式代码指示。 `GL_EXT_shader_explicit_arithmetic_types_float16` 扩展会在设备支持 16 位算术运算且用户开启了 `opt.use_fp16_arithmetic` 选项时,自动启用,无需显式代码指示。 @@ -389,6 +510,10 @@ void main() |NCNN_int8_packed|opt.use_int8_packed| |NCNN_int8_storage|opt.use_int8_storage| |NCNN_int8_arithmetic|opt.use_int8_arithmetic| +|NCNN_int16_packed|opt.use_int16_packed| +|NCNN_int16_storage|opt.use_int16_storage| |NCNN_bf16_packed|opt.use_bf16_packed| |NCNN_bf16_storage|opt.use_bf16_storage| +|NCNN_fp16_uniform|opt.use_fp16_uniform| +|NCNN_int8_uniform|opt.use_int8_uniform| |NCNN_shader_local_memory|opt.use_shader_local_memory| diff --git a/src/c_api.cpp b/src/c_api.cpp index efe8b771368a..4e7b0991f2ce 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -334,6 +334,16 @@ int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt) return ((const Option*)opt)->use_int8_arithmetic; } +int ncnn_option_get_use_int16_packed(const ncnn_option_t opt) +{ + return ((const Option*)opt)->use_int16_packed; +} + +int ncnn_option_get_use_int16_storage(const ncnn_option_t opt) +{ + return ((const Option*)opt)->use_int16_storage; +} + int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt) { return ((const Option*)opt)->use_bf16_packed; @@ -424,6 +434,16 @@ void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable) ((Option*)opt)->use_int8_arithmetic = enable; } +void ncnn_option_set_use_int16_packed(ncnn_option_t opt, int enable) +{ + ((Option*)opt)->use_int16_packed = enable; +} + +void ncnn_option_set_use_int16_storage(ncnn_option_t opt, int enable) +{ + ((Option*)opt)->use_int16_storage = enable; +} + void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable) { ((Option*)opt)->use_bf16_packed = enable; diff --git a/src/c_api.h b/src/c_api.h index 918cced4ded3..d0013c625f8e 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -75,6 +75,8 @@ 
NCNN_EXPORT int ncnn_option_get_use_fp16_arithmetic(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_int8_packed(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_int8_storage(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt); +NCNN_EXPORT int ncnn_option_get_use_int16_packed(const ncnn_option_t opt); +NCNN_EXPORT int ncnn_option_get_use_int16_storage(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_bf16_storage(const ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_use_shader_local_memory(const ncnn_option_t opt); @@ -91,6 +93,8 @@ NCNN_EXPORT void ncnn_option_set_use_fp16_arithmetic(ncnn_option_t opt, int enab NCNN_EXPORT void ncnn_option_set_use_int8_packed(ncnn_option_t opt, int enable); NCNN_EXPORT void ncnn_option_set_use_int8_storage(ncnn_option_t opt, int enable); NCNN_EXPORT void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable); +NCNN_EXPORT void ncnn_option_set_use_int16_packed(ncnn_option_t opt, int enable); +NCNN_EXPORT void ncnn_option_set_use_int16_storage(ncnn_option_t opt, int enable); NCNN_EXPORT void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable); NCNN_EXPORT void ncnn_option_set_use_bf16_storage(ncnn_option_t opt, int enable); NCNN_EXPORT void ncnn_option_set_use_shader_local_memory(ncnn_option_t opt, int enable); diff --git a/src/gpu.cpp b/src/gpu.cpp index 5da5c6ccacb1..cbc44ff29456 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -337,6 +337,9 @@ class GpuInfoPrivate bool support_cooperative_matrix_16_8_16; bool support_cooperative_matrix_16_16_16; + // int8 cooperative matrix feature + bool support_int8_cooperative_matrix; + // bf16 cooperative matrix feature bool support_bf16_cooperative_matrix; @@ -1436,6 +1439,7 @@ void GpuInfoPrivate::query_extension_properties() support_cooperative_matrix_16_8_8 = false; support_cooperative_matrix_16_8_16 = 
false; support_cooperative_matrix_16_16_16 = false; + support_int8_cooperative_matrix = false; support_bf16_cooperative_matrix = false; if (support_VK_KHR_cooperative_matrix && queryCooperativeMatrixFeatures.cooperativeMatrix) { @@ -1493,6 +1497,13 @@ void GpuInfoPrivate::query_extension_properties() support_cooperative_matrix_16_16_16 = true; } + if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR + && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + support_int8_cooperative_matrix = true; + } + if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.scope == VK_SCOPE_SUBGROUP_KHR) @@ -1556,6 +1567,13 @@ void GpuInfoPrivate::query_extension_properties() { support_cooperative_matrix_16_16_16 = true; } + + if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV + && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + support_int8_cooperative_matrix = true; + } } } @@ -1582,12 +1600,6 @@ void GpuInfoPrivate::query_extension_properties() { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret); } - - for (uint32_t j = 0; j < propertyCount; j++) - { - const VkCooperativeMatrixFlexibleDimensionsPropertiesNV& cmfdp = queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j]; - // NCNN_LOGE("cmfdp %2d %2d %2d %d %d %d %d %d %d %d", cmfdp.MGranularity, cmfdp.NGranularity, cmfdp.KGranularity, cmfdp.AType, cmfdp.BType, cmfdp.CType, cmfdp.ResultType, cmfdp.saturatingAccumulation, cmfdp.scope, cmfdp.workgroupInvocations); - } } // query supported cooperative vector types and operations @@ -1613,12 +1625,6 @@ void GpuInfoPrivate::query_extension_properties() { 
NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret); } - - for (uint32_t j = 0; j < propertyCount; j++) - { - const VkCooperativeVectorPropertiesNV& cvp = queryCooperativeVectorSubPropertiesNV[j]; - // NCNN_LOGE("cvp %d %d %d %d %d %d", cvp.inputType, cvp.inputInterpretation, cvp.matrixInterpretation, cvp.biasInterpretation, cvp.resultType, cvp.transpose); - } } if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_TURNIP) @@ -1943,6 +1949,21 @@ bool GpuInfo::support_int8_arithmetic() const return d->queryFloat16Int8Features.shaderInt8; } +bool GpuInfo::support_int16_packed() const +{ + return true; +} + +bool GpuInfo::support_int16_storage() const +{ + return d->query16BitStorageFeatures.storageBuffer16BitAccess; +} + +bool GpuInfo::support_int16_arithmetic() const +{ + return d->physicalDevicefeatures.shaderInt16; +} + bool GpuInfo::support_bf16_packed() const { return true; @@ -1998,6 +2019,11 @@ bool GpuInfo::support_cooperative_matrix_16_16_16() const return d->support_cooperative_matrix_16_16_16; } +bool GpuInfo::support_int8_cooperative_matrix() const +{ + return d->support_int8_cooperative_matrix && support_int8_arithmetic(); +} + bool GpuInfo::support_bf16_cooperative_matrix() const { return d->support_bf16_cooperative_matrix; @@ -3554,6 +3580,8 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_ opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); opt.use_int8_packed = use_int8; // int8p is always supported opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); + opt.use_int16_packed = false; + opt.use_int16_storage = false; opt.use_bf16_packed = use_bf16; // bf16p is always supported opt.use_bf16_storage = use_bf16 && vkdev->info.support_bf16_storage(); @@ -3633,6 +3661,8 @@ void VulkanDevicePrivate::destroy_utility_operator() opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); opt.use_int8_packed = false; opt.use_int8_storage = false; + 
opt.use_int16_packed = false; + opt.use_int16_storage = false; opt.use_bf16_packed = false; opt.use_bf16_storage = false; @@ -3660,6 +3690,8 @@ void VulkanDevicePrivate::destroy_utility_operator() opt.use_fp16_storage = false; opt.use_int8_packed = use_int8; opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); + opt.use_int16_packed = false; + opt.use_int16_storage = false; opt.use_bf16_packed = false; opt.use_bf16_storage = false; @@ -3689,6 +3721,8 @@ void VulkanDevicePrivate::destroy_utility_operator() opt.use_fp16_storage = false; opt.use_int8_packed = false; opt.use_int8_storage = false; + opt.use_int16_packed = false; + opt.use_int16_storage = false; opt.use_bf16_packed = use_bf16; opt.use_bf16_storage = use_bf16 && vkdev->info.support_bf16_storage(); @@ -5097,6 +5131,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option const GpuInfo& info = get_gpu_info(device_index); const bool support_fp16_storage = info.support_fp16_storage(); const bool support_fp16_uniform = info.support_fp16_uniform(); + const bool support_int16_arithmetic = info.physicalDevicefeatures().shaderInt16; if (opt.use_bf16_storage) { @@ -5448,6 +5483,42 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.append("sint8", "int"); } + if (opt.use_int16_storage) + { + custom_defines.append("NCNN_int16_storage", 1); + custom_defines.append("sint16", "int16_t"); + custom_defines.append("sint16vec4", "i16vec4"); + custom_defines.append("lint16", "int16_t"); + custom_defines.append("lint16vec4", "i16vec4"); + custom_defines.append("aint16", support_int16_arithmetic ? "int16_t" : "int"); + custom_defines.append("aint16vec4", support_int16_arithmetic ? "i16vec4" : "ivec4"); + custom_defines.append("lint162aint16(v)", support_int16_arithmetic ? "v" : "int(v)"); + custom_defines.append("lint162aint16vec4(v)", support_int16_arithmetic ? 
"v" : "ivec4(v)"); + } + else if (opt.use_int16_packed) + { + custom_defines.append("NCNN_int16_packed", 1); + custom_defines.append("sint16", "int"); + custom_defines.append("sint16vec4", "ivec2"); + custom_defines.append("lint16", "int"); + custom_defines.append("lint16vec4", "ivec2"); + custom_defines.append("aint16", support_int16_arithmetic ? "int16_t" : "int"); + custom_defines.append("aint16vec4", support_int16_arithmetic ? "i16vec4" : "ivec4"); + custom_defines.append("lint162aint16(v)", support_int16_arithmetic ? "int16_t(v)" : "int(v)"); + custom_defines.append("lint162aint16vec4(v)", support_int16_arithmetic ? "i16vec4(unpack16(v.r),unpack16(v.g))" : "ivec4(unpackInt2x16(v.r),unpackInt2x16(v.g))"); + } + else + { + custom_defines.append("sint16", "int"); + custom_defines.append("sint16vec4", "ivec4"); + custom_defines.append("lint16", "int"); + custom_defines.append("lint16vec4", "ivec4"); + custom_defines.append("aint16", "int"); + custom_defines.append("aint16vec4", "ivec4"); + custom_defines.append("lint162aint16(v)", "v"); + custom_defines.append("lint162aint16vec4(v)", "v"); + } + custom_defines.append("sint8vec4", "int"); custom_defines.append("aint8", "int"); @@ -5455,6 +5526,10 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)"); custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))"); + custom_defines.append("unpackInt2x16(v)", "ivec2((int(v)<<16)>>16,int(v)>>16)"); + custom_defines.append("packInt2x16(v)", "int((uint(v.r)&0xFFFFu)|((uint(v.g)&0xFFFFu)<<16))"); + custom_defines.append("float2int8(v)", "int(clamp(float(v)+(float(v)>=0.f?0.5f:-0.5f),-127.f,127.f))"); + custom_defines.append("float2int8vec4(v)", "ivec4(clamp(vec4(v)+mix(vec4(-0.5f),vec4(0.5f),greaterThanEqual(vec4(v),vec4(0.f))),vec4(-127.f),vec4(127.f)))"); if 
(opt.use_int8_storage) { @@ -5470,8 +5545,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option } custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])"); + custom_defines.append("i8buffer_sm4(buf,i)", "buf[i]"); custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}"); custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); + custom_defines.append("i8buffer_cp1to4(buf,i,sbuf,si)", "{ivec4 _v=ivec4(i8buffer_ld1(sbuf,si.r),i8buffer_ld1(sbuf,si.g),i8buffer_ld1(sbuf,si.b),i8buffer_ld1(sbuf,si.a));i8buffer_st4(buf,i,_v);}"); + custom_defines.append("i8buffer_cp4to1(buf,i4,sbuf,si)", "{ivec4 _v=i8buffer_ld4(sbuf,si);i8buffer_st1(buf,i4.r,_v.r);i8buffer_st1(buf,i4.g,_v.g);i8buffer_st1(buf,i4.b,_v.b);i8buffer_st1(buf,i4.a,_v.a);}"); + + if (opt.use_int16_storage) + { + custom_defines.append("i16buffer_ld1(buf,i)", "int(buf[i])"); + custom_defines.append("i16buffer_st1(buf,i,v)", "{buf[i]=int16_t(v);}"); + } + else if (opt.use_int16_packed) + { + custom_defines.append("i16buffer_ld1(buf,i)", "unpackInt2x16(buf[(i)/2])[(i)%2]"); + custom_defines.append("i16buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);ivec2 _v=unpackInt2x16(_old_v);_v[_im2]=_vs;_new_v=packInt2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}"); + } + else + { + custom_defines.append("i16buffer_ld1(buf,i)", "int(buf[i])"); + custom_defines.append("i16buffer_st1(buf,i,v)", "{buf[i]=int(v);}"); + } + custom_defines.append("i16buffer_ld2(buf,i)", "ivec2(i16buffer_ld1(buf,i),i16buffer_ld1(buf,(i)+1))"); + if (opt.use_int16_storage) + { + custom_defines.append("i16buffer_st2(buf,i,v)", "{ivec2 _v=ivec2(v);buf[i]=int16_t(_v.r);buf[(i)+1]=int16_t(_v.g);}"); + custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]"); + custom_defines.append("i16buffer_ld4(buf,i)", support_int16_arithmetic ? 
"buf[i]" : "ivec4(buf[i])"); + custom_defines.append("i16buffer_st4(buf,i,v)", "{buf[i]=i16vec4(v);}"); + } + else if (opt.use_int16_packed) + { + custom_defines.append("i16buffer_st2(buf,i,v)", "{uint _i=uint(i);ivec2 _v=ivec2(v);if((_i&1u)==0u){buf[_i/2]=packInt2x16(_v);}else{i16buffer_st1(buf,int(_i),_v.r);i16buffer_st1(buf,int(_i)+1,_v.g);}}"); + custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]"); + custom_defines.append("i16buffer_ld4(buf,i)", support_int16_arithmetic ? "i16vec4(unpack16(buf[i].r),unpack16(buf[i].g))" : "ivec4(unpackInt2x16(buf[i].r),unpackInt2x16(buf[i].g))"); + custom_defines.append("i16buffer_st4(buf,i,v)", "{ivec4 _v=ivec4(v);buf[i]=ivec2(packInt2x16(ivec2(_v.r,_v.g)),packInt2x16(ivec2(_v.b,_v.a)));}"); + } + else + { + custom_defines.append("i16buffer_st2(buf,i,v)", "{ivec2 _v=ivec2(v);buf[i]=int(_v.r);buf[(i)+1]=int(_v.g);}"); + custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]"); + custom_defines.append("i16buffer_ld4(buf,i)", "ivec4(buf[i])"); + custom_defines.append("i16buffer_st4(buf,i,v)", "{buf[i]=ivec4(v);}"); + } custom_defines.append("psc(x)", "(x==0?p.x:x)"); @@ -6103,7 +6219,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option { custom_exts += "#extension GL_EXT_bfloat16: require\n"; } - if (opt.use_fp16_storage || opt.use_bf16_storage) + if (opt.use_fp16_storage || opt.use_bf16_storage || opt.use_int16_storage) { custom_exts += "#extension GL_EXT_shader_16bit_storage: require\n"; } diff --git a/src/gpu.h b/src/gpu.h index aec8b033fe0a..e313a1a5a2b9 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -278,7 +278,7 @@ class NCNN_EXPORT GpuInfo // but sometimes bug is a feature bool bug_implicit_fp16_arithmetic() const; - // fp16 and int8 feature + // fp16/int8/int16 feature bool support_fp16_packed() const; bool support_fp16_storage() const; bool support_fp16_uniform() const; @@ -287,6 +287,10 @@ class NCNN_EXPORT GpuInfo bool support_int8_storage() const; bool support_int8_uniform() const; bool 
support_int8_arithmetic() const; + bool support_int16_packed() const; + // storage only; pair with support_int16_arithmetic() for shader int16 type + bool support_int16_storage() const; + bool support_int16_arithmetic() const; // bf16 feature bool support_bf16_packed() const; @@ -309,6 +313,9 @@ class NCNN_EXPORT GpuInfo bool support_cooperative_matrix_16_8_16() const; bool support_cooperative_matrix_16_16_16() const; + // int8 cooperative matrix feature + bool support_int8_cooperative_matrix() const; + // bf16 cooperative matrix feature bool support_bf16_cooperative_matrix() const; diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 04566e616edb..e82c25cedaf8 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -326,6 +326,13 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + return forward_int8_arm(bottom_blob, top_blob, opt); } #endif @@ -333,6 +340,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index 8533ee0b0c0e..991180aa93f3 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -188,6 +188,13 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -195,6 +202,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + int num_input = weight_data_size / num_output; if (bottom_blob.w * bottom_blob.elempack == num_input) { @@ -373,6 +383,13 @@ static inline signed char float2int8(float v) int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp index 612ff8122edb..0f024d3c2531 100644 --- a/src/layer/loongarch/convolution_loongarch.cpp +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -444,6 +444,13 @@ int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + #if NCNN_BF16 if (opt.use_bf16_storage && bottom_blob.elembits() == 16) { @@ -469,6 +476,9 @@ int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { @@ -1085,6 +1095,9 @@ int Convolution_loongarch::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp index 9655682afad3..f203c4345888 100644 --- a/src/layer/mips/convolution_mips.cpp +++ b/src/layer/mips/convolution_mips.cpp @@ -382,6 +382,13 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + #if NCNN_BF16 if (opt.use_bf16_storage && bottom_blob.elembits() == 16) { @@ -407,6 +414,9 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { @@ -1019,6 +1029,9 @@ int Convolution_mips::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index a45e8bc6223a..c51d673883fa 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -201,6 +201,13 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + Mat bottom_blob_unpacked = bottom_blob; if (bottom_blob.elempack != 1) { @@ -232,6 +239,9 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index cd35b9b451bb..aab3d049e03e 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -5,6 +5,7 @@ #include "layer_shader_type.h" #include "layer_type.h" +#include "modelbin.h" namespace ncnn { @@ -17,7 +18,6 @@ Convolution_vulkan::Convolution_vulkan() pipeline_convolution = 0; pipeline_convolution_1x1s1d1 = 0; - pipeline_convolution_gemm = 0; pipeline_convolution_3x3s1d1_winograd23_transform_input = 0; @@ -32,6 +32,11 @@ Convolution_vulkan::Convolution_vulkan() reshape_w = 0; use_cooperative_matrix = false; +#if NCNN_INT8 + quantize = 0; + pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = 0; + pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = 0; +#endif coopmat_M = 0; coopmat_N = 0; coopmat_K = 0; @@ -52,11 +57,26 @@ int Convolution_vulkan::load_param(const ParamDict& pd) support_vulkan = false; } +#if NCNN_INT8 + if (int8_scale_term && pad_value != 
0.f) + { + NCNN_LOGE("Convolution_vulkan int8 nonzero pad value is not supported"); + support_vulkan = false; + } +#endif + return ret; } int Convolution_vulkan::create_pipeline(const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term) + { + return create_pipeline_int8(opt); + } +#endif + Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; Mat out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; @@ -1497,6 +1517,13 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) pipeline_convolution_3x3s1d1_winograd43_gemm = 0; pipeline_convolution_3x3s1d1_winograd43_transform_output = 0; +#if NCNN_INT8 + delete pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm; + delete pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm; + pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = 0; + pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = 0; +#endif + // fc if (reshape_1x1xw) { @@ -1513,6 +1540,14 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) } use_cooperative_matrix = false; +#if NCNN_INT8 + if (quantize) + { + quantize->destroy_pipeline(opt); + delete quantize; + quantize = 0; + } +#endif coopmat_M = 0; coopmat_N = 0; coopmat_K = 0; @@ -1528,6 +1563,13 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term) + { + return upload_model_int8(cmd, opt); + } +#endif + if (padding) { padding->upload_model(cmd, opt); @@ -1575,6 +1617,13 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { +#if NCNN_INT8 + if (int8_scale_term) + { + return forward_int8(bottom_blob, top_blob, cmd, opt); + } +#endif + int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -1584,6 +1633,9 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // 
flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + int num_input = weight_data_size / num_output; if (bottom_blob.w * bottom_blob.elempack == num_input) { @@ -2101,4 +2153,1837 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom return 0; } +#if NCNN_INT8 +int Convolution_vulkan::create_pipeline_int8(const Option& opt) +{ + Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + Mat out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + // skip fc like hint + if (shape.dims != 3) shape = Mat(); + if (out_shape.dims != 3) out_shape = Mat(); + + if (weight_data.elemsize != (size_t)1u) + { + NCNN_LOGE("Convolution_vulkan int8 weight data is not int8"); + return -1; + } + + Option opt_int8 = opt; + opt_int8.use_fp16_arithmetic = false; + opt_int8.use_int16_packed = false; + opt_int8.use_int16_storage = false; + const bool use_int8_requantize = int8_scale_term > 100; + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + bool use_winograd = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16; + bool use_winograd43 = use_winograd && 
opt.use_winograd43_convolution; + bool use_winograd23 = use_winograd && opt.use_winograd23_convolution; + bool use_gemm = opt.use_sgemm_convolution && !is_conv1x1s1d1 && !use_winograd && num_input * maxk >= 8 && num_output >= 8; + const int elempack = opt.use_packing_layout && num_input % 4 == 0 ? 4 : 1; + const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1; + + Mat shape_int8; + if (shape.dims == 3) + { + shape_int8 = Mat(shape.w, shape.h, num_input / elempack, (void*)0, (size_t)elempack, elempack); + } + + Mat shape_int8_bordered; + if (shape_int8.dims == 3) + { + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + shape_int8_bordered = Mat(shape_int8.w + pad_left + pad_right, shape_int8.h + pad_top + pad_bottom, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack); + } + else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) + { + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int wpad = kernel_extent_w + (shape_int8.w - 1) / stride_w * stride_w - shape_int8.w; + int hpad = kernel_extent_h + (shape_int8.h - 1) / stride_h * stride_h - shape_int8.h; + if (wpad > 0 || hpad > 0) + { + shape_int8_bordered = Mat(shape_int8.w + wpad, shape_int8.h + hpad, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack); + } + else + { + shape_int8_bordered = shape_int8; + } + } + else + { + shape_int8_bordered = shape_int8; + } + } + + if (shape_int8_bordered.dims == 3 && use_winograd43 && use_winograd23) + { + const int w_bordered = shape_int8_bordered.w; + const int h_bordered = shape_int8_bordered.h; + + bool prefer_winograd43 = true; + if (vkdev->info.type() == 0 && ((w_bordered <= 18 && h_bordered <= 18) || ((w_bordered >= 23 && w_bordered <= 24) && (h_bordered >= 23 && h_bordered <= 24)))) + 
prefer_winograd43 = false; + if (vkdev->info.type() != 0 && (w_bordered <= 12 && h_bordered <= 12)) + prefer_winograd43 = false; + + use_winograd43 = prefer_winograd43; + use_winograd23 = !prefer_winograd43; + } + + Mat shape_padding_int8_bordered; + if (shape_int8_bordered.dims == 3) + { + const int padding_outc = shape_int8_bordered.c * shape_int8_bordered.elempack; + const int padding_out_elempack = padding_outc % 4 == 0 ? 4 : 1; + const size_t padding_out_elemsize = shape_int8_bordered.elemsize / shape_int8_bordered.elempack * padding_out_elempack; + shape_padding_int8_bordered = Mat(shape_int8_bordered.w, shape_int8_bordered.h, padding_outc / padding_out_elempack, (void*)0, padding_out_elemsize, padding_out_elempack); + } + + Mat out_shape_blob; + if (out_shape.dims == 3) + { + size_t out_elemsize; + if (use_int8_requantize) + { + out_elemsize = out_elempack; + } + else if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + out_elemsize = (size_t)2u * out_elempack; + } + else + { + out_elemsize = (size_t)4u * out_elempack; + } + + out_shape_blob = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack); + } + + { + quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize); + quantize->vkdev = vkdev; + + Mat shape_quantize; + Mat out_shape_quantize; + if (shape.dims == 3) + { + size_t shape_elemsize = shape.elemsize; + if (shape.elempack != 0) + shape_elemsize = shape.elemsize / shape.elempack * elempack; + + shape_quantize = Mat(shape.w, shape.h, num_input / elempack, (void*)0, shape_elemsize, elempack); + out_shape_quantize = shape_int8; + } + + quantize->bottom_shapes.resize(1); + quantize->bottom_shapes[0] = shape_quantize; + quantize->top_shapes.resize(1); + quantize->top_shapes[0] = out_shape_quantize; + + ncnn::ParamDict pd; + pd.set(0, 1); + quantize->load_param(pd); + + Mat weights[1]; + weights[0] = bottom_blob_int8_scales; + 
quantize->load_model(ModelBinFromMatArray(weights)); + + Option opt_quantize = opt; + opt_quantize.use_fp16_arithmetic = false; + + quantize->create_pipeline(opt_quantize); + } + + { + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); + padding->vkdev = vkdev; + + padding->bottom_shapes.resize(1); + padding->bottom_shapes[0] = shape_int8; + padding->top_shapes.resize(1); + padding->top_shapes[0] = shape_padding_int8_bordered; + + ncnn::ParamDict pd; + pd.set(0, pad_top); + pd.set(1, pad_bottom); + pd.set(2, pad_left); + pd.set(3, pad_right); + pd.set(4, 0); + pd.set(5, 0.f); + + padding->load_param(pd); + + padding->create_pipeline(opt); + } + + const int num_input_packed = (num_input + 3) / 4 * 4; + const int num_output_packed = (num_output + 3) / 4 * 4; + const int c_packed = num_input_packed / 4; + const int outc_pack4 = num_output_packed / 4; + const int c_shader = use_gemm || elempack == 4 ? c_packed : num_input; + + std::vector specializations(13 + 10); + specializations[0].i = kernel_w; + specializations[1].i = kernel_h; + specializations[2].i = dilation_w; + specializations[3].i = dilation_h; + specializations[4].i = stride_w; + specializations[5].i = stride_h; + specializations[6].i = bias_term; + specializations[7].i = activation_type; + specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10].i = use_int8_requantize ? 1 : 0; + specializations[11].i = elempack; + specializations[12].i = out_elempack; + specializations[13 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0; + specializations[13 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0; + specializations[13 + 2].i = shape_int8_bordered.dims != 0 ? c_shader : 0; + specializations[13 + 3].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0; + specializations[13 + 4].i = out_shape_blob.dims != 0 ? 
out_shape_blob.w : 0; + specializations[13 + 5].i = out_shape_blob.dims != 0 ? out_shape_blob.h : 0; + specializations[13 + 6].i = out_shape_blob.dims != 0 ? outc_pack4 : 0; + specializations[13 + 7].i = out_shape_blob.dims != 0 ? (out_elempack == 4 ? out_shape_blob.cstep : out_shape_blob.cstep * 4) : 0; + specializations[13 + 8].i = num_output; + specializations[13 + 9].i = num_input; + + const bool use_int8_winograd_int16_storage = use_winograd && opt.use_int16_storage && vkdev->info.support_int16_storage() && vkdev->info.support_int16_arithmetic(); + if (use_winograd) + { + if (use_int8_winograd_int16_storage) + opt_int8.use_int16_storage = true; + else + opt_int8.use_int16_packed = true; + } + + use_cooperative_matrix = false; + coopmat_M = 0; + coopmat_N = 0; + coopmat_K = 0; + coopmat_subgroup_size = 0; + + if (use_winograd && opt.use_cooperative_matrix && vkdev->info.support_int8_cooperative_matrix()) + { + int M = 1024; + if (out_shape.dims == 3) + { + const int block_x = use_winograd43 ? (out_shape.w + 3) / 4 : (out_shape.w + 1) / 2; + const int block_y = use_winograd43 ? 
(out_shape.h + 3) / 4 : (out_shape.h + 1) / 2; + M = block_x * block_y; + } + + const int N = num_output; + const int K = num_input; + + coopmat_subgroup_size = vkdev->info.querySubgroupProperties().subgroupSize; + + double min_cost = 1e300; + + if (vkdev->info.support_VK_KHR_cooperative_matrix() && vkdev->info.queryCooperativeMatrixFeatures().cooperativeMatrix) + { + const std::vector& properties = vkdev->info.queryCooperativeMatrixSubProperties(); + for (size_t i = 0; i < properties.size(); i++) + { + const VkCooperativeMatrixPropertiesKHR& cmp = properties[i]; + if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR + && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR + && cmp.MSize % 4 == 0 && cmp.NSize % 4 == 0 && cmp.KSize % 4 == 0) + { + const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize; + const int N_pad = (N + cmp.NSize - 1) / cmp.NSize * cmp.NSize; + const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize; + + double cost = (double)M_pad * N_pad * K_pad - (double)M * N * K; + if (cost < min_cost) + { + min_cost = cost; + coopmat_M = cmp.MSize; + coopmat_N = cmp.NSize; + coopmat_K = cmp.KSize; + } + } + } + } + else if (vkdev->info.support_VK_NV_cooperative_matrix() && vkdev->info.queryCooperativeMatrixFeaturesNV().cooperativeMatrix) + { + const std::vector& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV(); + for (size_t i = 0; i < properties.size(); i++) + { + const VkCooperativeMatrixPropertiesNV& cmp = properties[i]; + if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV + && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV + && cmp.MSize % 4 == 0 && cmp.NSize % 4 == 0 && cmp.KSize % 4 == 0) + { + const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize; + const int N_pad = (N + cmp.NSize - 
1) / cmp.NSize * cmp.NSize; + const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize; + + double cost = (double)M_pad * N_pad * K_pad - (double)M * N * K; + if (cost < min_cost) + { + min_cost = cost; + coopmat_M = cmp.MSize; + coopmat_N = cmp.NSize; + coopmat_K = cmp.KSize; + } + } + } + } + + if (coopmat_M != 0 && coopmat_N != 0 && coopmat_K != 0) + { + use_cooperative_matrix = true; + + UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2); + UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2); + UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2); + + UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2); + UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2); + } + } + else if ((is_conv1x1s1d1 || use_gemm) && opt.use_cooperative_matrix && opt.use_int8_arithmetic && vkdev->info.support_int8_cooperative_matrix()) + { + const int M = out_shape.dims == 3 ? out_shape.w * out_shape.h : 1024; + const int N = num_output; + const int K = is_conv1x1s1d1 ? 
num_input : num_input * maxk; + + if (N >= 8 && K >= 8) + { + vkdev->info.get_optimal_cooperative_matrix_mnk(M, N, K, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR, VK_SCOPE_SUBGROUP_KHR, coopmat_M, coopmat_N, coopmat_K, coopmat_subgroup_size); + } + + if (coopmat_M != 0 && coopmat_N != 0 && coopmat_K != 0) + { + use_cooperative_matrix = true; + + UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2); + UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2); + UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2); + + UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2); + UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2); + } + } + + if (is_conv1x1s1d1) + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + if (use_cooperative_matrix) + { + const signed char* weight_data_ptr = weight_data; + + const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + const int kk = (num_input + coopmat_K - 1) / coopmat_K; + const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + const int coopmat_Nd4 = coopmat_N / 4; + const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 
1 : 0); + + const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded; + weight_data_int8_packed.create(weight_data_int8_packed_size, blocks_n, (size_t)4u, 4); + if (weight_data_int8_packed.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int bn = 0; bn < blocks_n; bn++) + { + signed char* p = weight_data_int8_packed.row(bn); + + for (int k = 0; k < kk_padded; k += UNROLL_SG_K) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_Nd4p; j++) + { + for (int jj = 0; jj < 4; jj++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj; + const int gki = (k + zk) * coopmat_K + i; + + *p++ = j < coopmat_Nd4 && gni < num_output && gki < num_input ? weight_data_ptr[gni * num_input + gki] : 0; + } + } + } + } + } + } + } + } + } + else + { + const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8; + const int num_output_packed = num_output_pack4_aligned * 4; + + weight_data_int8_packed.create(maxk, num_input_packed / 4, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4); + + for (int q = 0; q < num_output_packed; q += 4) + { + signed char* g00 = weight_data_int8_packed.channel(q / 4); + + for (int p = 0; p < num_input_packed; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + if (q + i < num_output && p + j < num_input) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + g00[0] = k00[k]; + } + else + { + g00[0] = 0; + } + g00++; + } + } + } + } + } + } + } + else + { + if (use_gemm) + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + if (use_cooperative_matrix) + { + const signed char* weight_data_ptr = weight_data; + + const int K = num_input * maxk; + const 
int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + const int kk = (K + coopmat_K - 1) / coopmat_K; + const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + const int coopmat_Nd4 = coopmat_N / 4; + const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0); + + const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded; + weight_data_int8_packed.create(weight_data_int8_packed_size, blocks_n, (size_t)4u, 4); + if (weight_data_int8_packed.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int bn = 0; bn < blocks_n; bn++) + { + signed char* p = weight_data_int8_packed.row(bn); + + for (int k = 0; k < kk_padded; k += UNROLL_SG_K) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_Nd4p; j++) + { + for (int jj = 0; jj < 4; jj++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj; + const int gki = (k + zk) * coopmat_K + i; + + *p++ = j < coopmat_Nd4 && gni < num_output && gki < K ? 
weight_data_ptr[gni * K + gki] : 0; + } + } + } + } + } + } + } + } + } + else + { + const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8; + + weight_data_int8_packed.create(maxk, c_packed, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output_pack4_aligned; q++) + { + signed char* g00 = weight_data_int8_packed.channel(q); + + for (int p = 0; p < num_input_packed; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + if (q * 4 + i < num_output && p + j < num_input) + { + const signed char* k00 = weight_data_r2.channel(q * 4 + i).row(p + j); + g00[0] = k00[k]; + } + else + { + g00[0] = 0; + } + g00++; + } + } + } + } + } + } + } + else if (!use_winograd) + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8; + + if (elempack == 4) + { + weight_data_int8_packed.create(maxk, c_packed, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4); + } + else + { + weight_data_int8_packed.create(maxk * num_input, num_output_pack4_aligned, (size_t)4u, 4); + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output_pack4_aligned; q++) + { + signed char* g00 = elempack == 4 ? weight_data_int8_packed.channel(q) : weight_data_int8_packed.row(q); + + const int num_input_loop = elempack == 4 ? 
num_input_packed : num_input; + for (int p = 0; p < num_input_loop; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < elempack; j++) + { + if (q * 4 + i < num_output && p + j < num_input) + { + const signed char* k00 = weight_data_r2.channel(q * 4 + i).row(p + j); + g00[0] = k00[k]; + } + else + { + g00[0] = 0; + } + g00++; + } + } + } + } + } + } + else + { + weight_data_int8_packed = weight_data.reshape(weight_data_size); + } + } + + { + int num_output_packed = (num_output + 3) / 4 * 4; + if (!is_conv1x1s1d1 && !use_winograd && !use_gemm) + num_output_packed = (num_output + 7) / 8 * 8; + + const float bottom_blob_int8_scale = bottom_blob_int8_scales.empty() ? 1.f : bottom_blob_int8_scales[0]; + const float bottom_blob_int8_descale = bottom_blob_int8_scale == 0.f ? 0.f : 1.f / bottom_blob_int8_scale; + + if (is_conv1x1s1d1) + weight_data_int8_descales.create(num_output_packed / 4, (size_t)4u * 4, 4); + else + weight_data_int8_descales.create(num_output_packed, (size_t)4u, 1); + if (weight_data_int8_descales.empty()) + return -100; + + float* outptr = weight_data_int8_descales; + for (int q = 0; q < num_output_packed; q += 4) + { + float scale0 = q + 0 < num_output ? weight_data_int8_scales[q + 0] : 0.f; + float scale1 = q + 1 < num_output ? weight_data_int8_scales[q + 1] : 0.f; + float scale2 = q + 2 < num_output ? weight_data_int8_scales[q + 2] : 0.f; + float scale3 = q + 3 < num_output ? weight_data_int8_scales[q + 3] : 0.f; + outptr[0] = scale0 == 0.f ? 0.f : bottom_blob_int8_descale / scale0; + outptr[1] = scale1 == 0.f ? 0.f : bottom_blob_int8_descale / scale1; + outptr[2] = scale2 == 0.f ? 0.f : bottom_blob_int8_descale / scale2; + outptr[3] = scale3 == 0.f ? 
0.f : bottom_blob_int8_descale / scale3; + outptr += 4; + } + } + + if (bias_term) + { + int num_output_packed = (num_output + 3) / 4 * 4; + if (!is_conv1x1s1d1 && !use_winograd && !use_gemm) + num_output_packed = (num_output + 7) / 8 * 8; + + bias_data_int8_packed.create(num_output_packed, (size_t)4u, 1); + if (bias_data_int8_packed.empty()) + return -100; + + float* outptr = bias_data_int8_packed; + for (int q = 0; q < num_output_packed; q += 4) + { + outptr[0] = q + 0 < num_output ? bias_data[q + 0] : 0.f; + outptr[1] = q + 1 < num_output ? bias_data[q + 1] : 0.f; + outptr[2] = q + 2 < num_output ? bias_data[q + 2] : 0.f; + outptr[3] = q + 3 < num_output ? bias_data[q + 3] : 0.f; + outptr += 4; + } + } + + if (use_winograd) + { + if (use_winograd43) + { + Mat weight_data_tm; + weight_data_tm.create(36, num_input, num_output); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + for (int q = 0; q < num_input; q++) + { + const signed char* kernel0 = (const signed char*)weight_data + p * num_input * 9 + q * 9; + int* kernel_tm0 = weight_data_tm.channel(p).row(q); + + int tmp[6][3]; + for (int m = 0; m < 3; m++) + { + const int r0 = kernel0[0]; + const int r1 = kernel0[1]; + const int r2 = kernel0[2]; + + tmp[0][m] = r0 * 6; + tmp[1][m] = -r0 * 4 - r1 * 4 - r2 * 4; + tmp[2][m] = -r0 * 4 + r1 * 4 - r2 * 4; + tmp[3][m] = r0 + r1 * 2 + r2 * 4; + tmp[4][m] = r0 - r1 * 2 + r2 * 4; + tmp[5][m] = r2 * 6; + + kernel0 += 3; + } + + for (int m = 0; m < 6; m++) + { + const int r0 = tmp[m][0]; + const int r1 = tmp[m][1]; + const int r2 = tmp[m][2]; + + kernel_tm0[m * 6 + 0] = r0 * 6; + kernel_tm0[m * 6 + 1] = -r0 * 4 - r1 * 4 - r2 * 4; + kernel_tm0[m * 6 + 2] = -r0 * 4 + r1 * 4 - r2 * 4; + kernel_tm0[m * 6 + 3] = r0 + r1 * 2 + r2 * 4; + kernel_tm0[m * 6 + 4] = r0 - r1 * 2 + r2 * 4; + kernel_tm0[m * 6 + 5] = r2 * 6; + } + } + } + + { + if (use_cooperative_matrix) + { + const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * 
UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + const int kk = (num_input + coopmat_K - 1) / coopmat_K; + const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + const int coopmat_Nd4 = coopmat_N / 4; + const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0); + + const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded; + weight_winograd43_data_int8_packed_cm.create(weight_data_int8_packed_size, blocks_n, 36, (size_t)8u, 1); + if (weight_winograd43_data_int8_packed_cm.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int b = 0; b < 36; b++) + { + for (int bn = 0; bn < blocks_n; bn++) + { + signed char* p0 = weight_winograd43_data_int8_packed_cm.channel(b).row(bn); + + for (int k = 0; k < kk_padded; k += UNROLL_SG_K) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_Nd4p; j++) + { + for (int jj = 0; jj < 4; jj++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj; + const int gki = (k + zk) * coopmat_K + i; + + const int v = j < coopmat_Nd4 && gni < num_output && gki < num_input ? 
weight_data_tm.channel(gni).row(gki)[b] : 0; + int vlow = v & 255; + if (vlow >= 128) vlow -= 256; + p0[jj] = (signed char)vlow; + p0[4 + jj] = (signed char)((v - vlow) >> 8); + } + p0 += 8; + } + } + } + } + } + } + } + } + } + { + const int num_input_packed = (num_input + 3) / 4 * 4; + const int num_output_packed = (num_output + 3) / 4 * 4; + const int c4 = num_input_packed / 4; + + weight_winograd43_data_int8_packed.create(c4, num_output_packed, 36, (size_t)8u, 1); + + for (int k = 0; k < 36; k++) + { + int* g00 = weight_winograd43_data_int8_packed.channel(k); + + for (int p = 0; p < num_output_packed; p++) + { + const int* k0 = p < num_output ? weight_data_tm.channel(p) : 0; + + for (int q = 0; q < num_input_packed; q += 4) + { + const int v0 = k0 && q + 0 < num_input ? k0[(q + 0) * 36 + k] : 0; + const int v1 = k0 && q + 1 < num_input ? k0[(q + 1) * 36 + k] : 0; + const int v2 = k0 && q + 2 < num_input ? k0[(q + 2) * 36 + k] : 0; + const int v3 = k0 && q + 3 < num_input ? k0[(q + 3) * 36 + k] : 0; + + g00[0] = (int)(((unsigned int)(unsigned short)v0) | ((unsigned int)(unsigned short)v1 << 16)); + g00[1] = (int)(((unsigned int)(unsigned short)v2) | ((unsigned int)(unsigned short)v3 << 16)); + g00 += 2; + } + } + } + } + } + + { + int block_x = 0; + int block_y = 0; + Mat shape_winograd_input_transformed; + + if (shape_int8_bordered.dims == 3 && out_shape_blob.dims == 3) + { + block_x = (out_shape_blob.w + 3) / 4; + block_y = (out_shape_blob.h + 3) / 4; + if (use_cooperative_matrix) + { + if (elempack == 4) + shape_winograd_input_transformed = Mat(block_x * block_y * 2, 1, c_packed * 36, (void*)0, (size_t)4u, 4); + else + shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 36, (void*)0, (size_t)1u, 1); + } + else + { + if (elempack == 4) + shape_winograd_input_transformed = Mat(block_x * block_y, 1, c_packed * 36, (void*)0, (size_t)8u, 4); + else + shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 36, (void*)0, 
(size_t)2u, 1); + } + } + + std::vector specializations_winograd_input(1 + 6); + specializations_winograd_input[0].i = elempack == 4 ? c_packed : num_input; + specializations_winograd_input[1 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0; + specializations_winograd_input[1 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0; + specializations_winograd_input[1 + 2].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0; + specializations_winograd_input[1 + 3].i = shape_winograd_input_transformed.dims != 0 ? shape_winograd_input_transformed.cstep : 0; + specializations_winograd_input[1 + 4].i = block_x; + specializations_winograd_input[1 + 5].i = block_y; + + int shader_type_index = -1; + if (use_cooperative_matrix) + { + if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_input_int8_cm; + if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm; + } + else + { + if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_input_int8; + if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_input_int8; + } + + pipeline_convolution_3x3s1d1_winograd43_transform_input = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd43_transform_input->set_local_size_xyz(4, 4, 1); + pipeline_convolution_3x3s1d1_winograd43_transform_input->create(shader_type_index, opt_int8, specializations_winograd_input); + } + { + // winograd23/43 share gemm shader, transform count is set by dispatcher.c + pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev); + { + std::vector specializations_winograd_gemm(5 + 3); + specializations_winograd_gemm[0].i = 36; + specializations_winograd_gemm[1].i = num_input; + specializations_winograd_gemm[2].i = num_output; + specializations_winograd_gemm[3].i = elempack; + specializations_winograd_gemm[4].i = 
out_elempack; + specializations_winograd_gemm[5 + 0].i = 0; + specializations_winograd_gemm[5 + 1].i = 0; + specializations_winograd_gemm[5 + 2].i = 0; + + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(opt_int8.use_shader_local_memory ? 8 : 4, opt_int8.use_shader_local_memory ? 8 : std::min(4, (num_output + 3) / 4), opt_int8.use_shader_local_memory ? 1 : 4); + pipeline_convolution_3x3s1d1_winograd43_gemm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8, opt_int8, specializations_winograd_gemm); + } + + if (use_cooperative_matrix) + { + std::vector specializations_winograd_gemm(15 + 3); + specializations_winograd_gemm[0].u32 = 36; + specializations_winograd_gemm[1].u32 = coopmat_M; + specializations_winograd_gemm[2].u32 = coopmat_N; + specializations_winograd_gemm[3].u32 = coopmat_K; + specializations_winograd_gemm[4].u32 = UNROLL_SG_M; + specializations_winograd_gemm[5].u32 = UNROLL_SG_N; + specializations_winograd_gemm[6].u32 = UNROLL_SG_K; + specializations_winograd_gemm[7].u32 = UNROLL_WG_M; + specializations_winograd_gemm[8].u32 = UNROLL_WG_N; + specializations_winograd_gemm[9].u32 = coopmat_subgroup_size; + specializations_winograd_gemm[10].u32 = num_input; + specializations_winograd_gemm[11].u32 = num_output; + specializations_winograd_gemm[12].u32 = elempack; + specializations_winograd_gemm[13].u32 = out_elempack; + specializations_winograd_gemm[14].u32 = weight_winograd43_data_int8_packed_cm.cstep; + specializations_winograd_gemm[15 + 0].u32 = 0; + specializations_winograd_gemm[15 + 1].u32 = 0; + specializations_winograd_gemm[15 + 2].u32 = 0; + + pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->set_subgroup_size(coopmat_subgroup_size); + pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); + 
pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8_cm, opt_int8, specializations_winograd_gemm); + } + } + { + std::vector specializations_winograd_output(5); + specializations_winograd_output[0].i = bias_term; + specializations_winograd_output[1].i = activation_type; + specializations_winograd_output[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations_winograd_output[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations_winograd_output[4].i = use_int8_requantize ? 1 : 0; + + int shader_type_index = -1; + if (out_elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_output_int8; + if (out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_output_int8; + + pipeline_convolution_3x3s1d1_winograd43_transform_output = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd43_transform_output->set_local_size_xyz(4, 4, 1); + pipeline_convolution_3x3s1d1_winograd43_transform_output->create(shader_type_index, opt_int8, specializations_winograd_output); + } + } + + if (use_winograd23) + { + Mat weight_data_tm; + weight_data_tm.create(16, num_input, num_output); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + for (int q = 0; q < num_input; q++) + { + const signed char* kernel0 = (const signed char*)weight_data + p * num_input * 9 + q * 9; + int* kernel_tm0 = weight_data_tm.channel(p).row(q); + + int tmp[4][3]; + for (int m = 0; m < 3; m++) + { + const int r0 = kernel0[0]; + const int r1 = kernel0[1]; + const int r2 = kernel0[2]; + + tmp[0][m] = r0 * 2; + tmp[1][m] = r0 + r1 + r2; + tmp[2][m] = r0 - r1 + r2; + tmp[3][m] = r2 * 2; + + kernel0 += 3; + } + + for (int m = 0; m < 4; m++) + { + const int r0 = tmp[m][0]; + const int r1 = tmp[m][1]; + const int r2 = tmp[m][2]; + + kernel_tm0[m * 4 + 0] = r0 * 2; + 
kernel_tm0[m * 4 + 1] = r0 + r1 + r2; + kernel_tm0[m * 4 + 2] = r0 - r1 + r2; + kernel_tm0[m * 4 + 3] = r2 * 2; + } + } + } + + { + if (use_cooperative_matrix) + { + const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + const int kk = (num_input + coopmat_K - 1) / coopmat_K; + const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + const int coopmat_Nd4 = coopmat_N / 4; + const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0); + + const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded; + weight_winograd23_data_int8_packed_cm.create(weight_data_int8_packed_size, blocks_n, 16, (size_t)8u, 1); + if (weight_winograd23_data_int8_packed_cm.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int b = 0; b < 16; b++) + { + for (int bn = 0; bn < blocks_n; bn++) + { + signed char* p0 = weight_winograd23_data_int8_packed_cm.channel(b).row(bn); + + for (int k = 0; k < kk_padded; k += UNROLL_SG_K) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_Nd4p; j++) + { + for (int jj = 0; jj < 4; jj++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj; + const int gki = (k + zk) * coopmat_K + i; + + const int v = j < coopmat_Nd4 && gni < num_output && gki < num_input ? 
weight_data_tm.channel(gni).row(gki)[b] : 0; + int vlow = v & 255; + if (vlow >= 128) vlow -= 256; + p0[jj] = (signed char)vlow; + p0[4 + jj] = (signed char)((v - vlow) >> 8); + } + p0 += 8; + } + } + } + } + } + } + } + } + } + { + const int num_input_packed = (num_input + 3) / 4 * 4; + const int num_output_packed = (num_output + 3) / 4 * 4; + const int c4 = num_input_packed / 4; + + weight_winograd23_data_int8_packed.create(c4, num_output_packed, 16, (size_t)8u, 1); + + for (int k = 0; k < 16; k++) + { + int* g00 = weight_winograd23_data_int8_packed.channel(k); + + for (int p = 0; p < num_output_packed; p++) + { + const int* k0 = p < num_output ? weight_data_tm.channel(p) : 0; + + for (int q = 0; q < num_input_packed; q += 4) + { + const int v0 = k0 && q + 0 < num_input ? k0[(q + 0) * 16 + k] : 0; + const int v1 = k0 && q + 1 < num_input ? k0[(q + 1) * 16 + k] : 0; + const int v2 = k0 && q + 2 < num_input ? k0[(q + 2) * 16 + k] : 0; + const int v3 = k0 && q + 3 < num_input ? k0[(q + 3) * 16 + k] : 0; + + g00[0] = (int)(((unsigned int)(unsigned short)v0) | ((unsigned int)(unsigned short)v1 << 16)); + g00[1] = (int)(((unsigned int)(unsigned short)v2) | ((unsigned int)(unsigned short)v3 << 16)); + g00 += 2; + } + } + } + } + } + + { + int block_x = 0; + int block_y = 0; + Mat shape_winograd_input_transformed; + + if (shape_int8_bordered.dims == 3 && out_shape_blob.dims == 3) + { + block_x = (out_shape_blob.w + 1) / 2; + block_y = (out_shape_blob.h + 1) / 2; + if (use_cooperative_matrix) + { + if (elempack == 4) + shape_winograd_input_transformed = Mat(block_x * block_y * 2, 1, c_packed * 16, (void*)0, (size_t)4u, 4); + else + shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 16, (void*)0, (size_t)1u, 1); + } + else + { + if (elempack == 4) + shape_winograd_input_transformed = Mat(block_x * block_y, 1, c_packed * 16, (void*)0, (size_t)8u, 4); + else + shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 16, (void*)0, 
(size_t)2u, 1); + } + } + + std::vector specializations_winograd_input(1 + 6); + specializations_winograd_input[0].i = elempack == 4 ? c_packed : num_input; + specializations_winograd_input[1 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0; + specializations_winograd_input[1 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0; + specializations_winograd_input[1 + 2].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0; + specializations_winograd_input[1 + 3].i = shape_winograd_input_transformed.dims != 0 ? shape_winograd_input_transformed.cstep : 0; + specializations_winograd_input[1 + 4].i = block_x; + specializations_winograd_input[1 + 5].i = block_y; + + int shader_type_index = -1; + if (use_cooperative_matrix) + { + if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_input_int8_cm; + if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm; + } + else + { + if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_input_int8; + if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_input_int8; + } + + pipeline_convolution_3x3s1d1_winograd23_transform_input = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd23_transform_input->set_local_size_xyz(8, 8, 1); + pipeline_convolution_3x3s1d1_winograd23_transform_input->create(shader_type_index, opt_int8, specializations_winograd_input); + } + { + // winograd23/43 share gemm shader, transform count is set by dispatcher.c + pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev); + { + std::vector specializations_winograd_gemm(5 + 3); + specializations_winograd_gemm[0].i = 16; + specializations_winograd_gemm[1].i = num_input; + specializations_winograd_gemm[2].i = num_output; + specializations_winograd_gemm[3].i = elempack; + specializations_winograd_gemm[4].i = 
out_elempack; + specializations_winograd_gemm[5 + 0].i = 0; + specializations_winograd_gemm[5 + 1].i = 0; + specializations_winograd_gemm[5 + 2].i = 0; + + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(opt_int8.use_shader_local_memory ? 8 : 4, opt_int8.use_shader_local_memory ? 8 : std::min(4, (num_output + 3) / 4), opt_int8.use_shader_local_memory ? 1 : 4); + pipeline_convolution_3x3s1d1_winograd23_gemm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8, opt_int8, specializations_winograd_gemm); + } + + if (use_cooperative_matrix) + { + std::vector specializations_winograd_gemm(15 + 3); + specializations_winograd_gemm[0].u32 = 16; + specializations_winograd_gemm[1].u32 = coopmat_M; + specializations_winograd_gemm[2].u32 = coopmat_N; + specializations_winograd_gemm[3].u32 = coopmat_K; + specializations_winograd_gemm[4].u32 = UNROLL_SG_M; + specializations_winograd_gemm[5].u32 = UNROLL_SG_N; + specializations_winograd_gemm[6].u32 = UNROLL_SG_K; + specializations_winograd_gemm[7].u32 = UNROLL_WG_M; + specializations_winograd_gemm[8].u32 = UNROLL_WG_N; + specializations_winograd_gemm[9].u32 = coopmat_subgroup_size; + specializations_winograd_gemm[10].u32 = num_input; + specializations_winograd_gemm[11].u32 = num_output; + specializations_winograd_gemm[12].u32 = elempack; + specializations_winograd_gemm[13].u32 = out_elempack; + specializations_winograd_gemm[14].u32 = weight_winograd23_data_int8_packed_cm.cstep; + specializations_winograd_gemm[15 + 0].u32 = 0; + specializations_winograd_gemm[15 + 1].u32 = 0; + specializations_winograd_gemm[15 + 2].u32 = 0; + + pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->set_subgroup_size(coopmat_subgroup_size); + pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); + 
pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8_cm, opt_int8, specializations_winograd_gemm); + } + } + { + std::vector specializations_winograd_output(5); + specializations_winograd_output[0].i = bias_term; + specializations_winograd_output[1].i = activation_type; + specializations_winograd_output[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations_winograd_output[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations_winograd_output[4].i = use_int8_requantize ? 1 : 0; + + int shader_type_index = -1; + if (out_elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_output_int8; + if (out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_output_int8; + + pipeline_convolution_3x3s1d1_winograd23_transform_output = new Pipeline(vkdev); + pipeline_convolution_3x3s1d1_winograd23_transform_output->set_local_size_xyz(8, 8, 1); + pipeline_convolution_3x3s1d1_winograd23_transform_output->create(shader_type_index, opt_int8, specializations_winograd_output); + } + } + } + else if (is_conv1x1s1d1) + { + if (use_cooperative_matrix) + { + std::vector specializations_1x1(7 + 5 + 9); + specializations_1x1[0].i = bias_term; + specializations_1x1[1].i = activation_type; + specializations_1x1[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations_1x1[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations_1x1[4].i = use_int8_requantize ? 1 : 0; + specializations_1x1[5].u32 = elempack; + specializations_1x1[6].u32 = out_elempack; + specializations_1x1[7 + 0].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0; + specializations_1x1[7 + 1].u32 = out_shape_blob.dims != 0 ? out_shape_blob.cstep : 0; + specializations_1x1[7 + 2].u32 = out_shape.dims != 0 ? 
out_shape.w * out_shape.h : 0; + specializations_1x1[7 + 3].u32 = num_output; + specializations_1x1[7 + 4].u32 = num_input; + specializations_1x1[12 + 0].u32 = coopmat_M; + specializations_1x1[12 + 1].u32 = coopmat_N; + specializations_1x1[12 + 2].u32 = coopmat_K; + specializations_1x1[12 + 3].u32 = coopmat_subgroup_size; + specializations_1x1[12 + 4].u32 = UNROLL_SG_M; + specializations_1x1[12 + 5].u32 = UNROLL_SG_N; + specializations_1x1[12 + 6].u32 = UNROLL_SG_K; + specializations_1x1[12 + 7].u32 = UNROLL_WG_M; + specializations_1x1[12 + 8].u32 = UNROLL_WG_N; + + pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_1x1s1d1->set_subgroup_size(coopmat_subgroup_size); + pipeline_convolution_1x1s1d1->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); + pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1_int8_cm, opt_int8, specializations_1x1); + } + else + { + const int num_input_packed = (num_input + 3) / 4 * 4; + const int num_output_packed = (num_output + 3) / 4 * 4; + + const int c_packed = num_input_packed / 4; + const int cstep_vec4 = shape_int8_bordered.dims != 0 ? (elempack == 4 ? shape_int8_bordered.cstep : shape_int8_bordered.cstep / 4) : 0; + const int outc_pack4 = num_output_packed / 4; + + std::vector specializations_1x1(7 + 8); + specializations_1x1[0].i = bias_term; + specializations_1x1[1].i = activation_type; + specializations_1x1[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations_1x1[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations_1x1[4].i = use_int8_requantize ? 1 : 0; + specializations_1x1[5].i = elempack; + specializations_1x1[6].i = out_elempack; + specializations_1x1[7 + 0].i = c_packed; + specializations_1x1[7 + 1].i = cstep_vec4; + specializations_1x1[7 + 2].i = out_shape.dims != 0 ? outc_pack4 : 0; + specializations_1x1[7 + 3].i = out_shape_blob.dims != 0 ? (out_elempack == 4 ? 
out_shape_blob.cstep : out_shape_blob.cstep / 4) : 0; + specializations_1x1[7 + 4].i = out_shape_blob.dims != 0 ? out_shape_blob.cstep / 4 : 0; + specializations_1x1[7 + 5].i = out_shape.dims != 0 ? (out_shape.w * out_shape.h + 3) / 4 : 0; + specializations_1x1[7 + 6].i = num_output; + specializations_1x1[7 + 7].i = num_input; + + Mat local_size_xyz(8, std::min(8, outc_pack4), 1, (void*)0); + + pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + if (opt_int8.use_shader_local_memory) + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz); + } + pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_packed_1x1s1d1_int8, opt_int8, specializations_1x1); + } + } + else if (use_gemm) + { + if (use_cooperative_matrix) + { + std::vector specializations_gemm(13 + 8 + 9); + specializations_gemm[0].u32 = kernel_w; + specializations_gemm[1].u32 = kernel_h; + specializations_gemm[2].u32 = dilation_w; + specializations_gemm[3].u32 = dilation_h; + specializations_gemm[4].u32 = stride_w; + specializations_gemm[5].u32 = stride_h; + specializations_gemm[6].i = bias_term; + specializations_gemm[7].i = activation_type; + specializations_gemm[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations_gemm[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations_gemm[10].i = use_int8_requantize ? 1 : 0; + specializations_gemm[11].u32 = elempack; + specializations_gemm[12].u32 = out_elempack; + specializations_gemm[13 + 0].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0; + specializations_gemm[13 + 1].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0; + specializations_gemm[13 + 2].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0; + specializations_gemm[13 + 3].u32 = out_shape_blob.dims != 0 ? out_shape_blob.w : 0; + specializations_gemm[13 + 4].u32 = out_shape_blob.dims != 0 ? 
out_shape_blob.h : 0; + specializations_gemm[13 + 5].u32 = out_shape_blob.dims != 0 ? out_shape_blob.cstep : 0; + specializations_gemm[13 + 6].u32 = num_output; + specializations_gemm[13 + 7].u32 = num_input; + specializations_gemm[21 + 0].u32 = coopmat_M; + specializations_gemm[21 + 1].u32 = coopmat_N; + specializations_gemm[21 + 2].u32 = coopmat_K; + specializations_gemm[21 + 3].u32 = coopmat_subgroup_size; + specializations_gemm[21 + 4].u32 = UNROLL_SG_M; + specializations_gemm[21 + 5].u32 = UNROLL_SG_N; + specializations_gemm[21 + 6].u32 = UNROLL_SG_K; + specializations_gemm[21 + 7].u32 = UNROLL_WG_M; + specializations_gemm[21 + 8].u32 = UNROLL_WG_N; + + pipeline_convolution_gemm = new Pipeline(vkdev); + pipeline_convolution_gemm->set_subgroup_size(coopmat_subgroup_size); + pipeline_convolution_gemm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); + pipeline_convolution_gemm->create(LayerShaderType::convolution_gemm_int8_cm, opt_int8, specializations_gemm); + } + else + { + const int outc_pack4 = (num_output + 3) / 4; + const int outsize = shape.dims == 3 ? 
(shape.w * shape.h + 3) / 4 : 16; + Mat local_size_xyz(std::min(8, outsize), std::min(8, outc_pack4), 1, (void*)0); + + pipeline_convolution_gemm = new Pipeline(vkdev); + if (opt_int8.use_shader_local_memory) + { + pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + } + pipeline_convolution_gemm->create(LayerShaderType::convolution_packed_gemm_int8, opt_int8, specializations); + } + } + else + { + const int outc_pack4 = (num_output + 3) / 4; + Mat local_size_xyz(8, 8, std::min(4, (outc_pack4 + 1) / 2), (void*)0); + + std::vector specializations_direct = specializations; + for (int i = 0; i < 8; i++) + { + specializations_direct[13 + i].i = 0; + } + + pipeline_convolution = new Pipeline(vkdev); + pipeline_convolution->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution->create(LayerShaderType::convolution_packed_int8, opt_int8, specializations_direct); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt)/* Records on cmd the uploads needed by the int8 convolution: the packed weights of the path selected below, the weight descales, the output scales when requantizing, and the bias. Host-side packed copies are released after their upload is recorded (top_blob_int8_scales is not released here). Then forwards to padding, when present, and to quantize. Always returns 0. */ +{ + Option opt_fp32 = opt;/* copy of opt whose fp16 and bf16 packed/storage flags are cleared below */ + opt_fp32.use_fp16_packed = false; + opt_fp32.use_fp16_storage = false; + opt_fp32.use_bf16_packed = false; + opt_fp32.use_bf16_storage = false; + + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + const bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + const bool use_winograd = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16;/* identical to the use_winograd expression in forward_int8 */ + + if (use_winograd) + { + if (use_cooperative_matrix)/* upload only the _cm winograd weights */ + { + if (!weight_winograd43_data_int8_packed_cm.empty()) + { + cmd.record_upload(weight_winograd43_data_int8_packed_cm, weight_data_gpu_tm_winograd43_int8_cm, opt); + + 
weight_winograd43_data_int8_packed_cm.release(); + } + + if (!weight_winograd23_data_int8_packed_cm.empty()) + { + cmd.record_upload(weight_winograd23_data_int8_packed_cm, weight_data_gpu_tm_winograd23_int8_cm, opt); + + weight_winograd23_data_int8_packed_cm.release(); + } + + weight_winograd43_data_int8_packed.release(); + weight_winograd23_data_int8_packed.release();/* the non-cm packed copies are released without being uploaded */ + } + else + { + weight_winograd43_data_int8_packed_cm.release(); + weight_winograd23_data_int8_packed_cm.release();/* here the _cm copies are the ones released without upload */ + + if (!weight_winograd43_data_int8_packed.empty()) + { + cmd.record_upload(weight_winograd43_data_int8_packed, weight_data_gpu_tm_winograd43, opt_fp32); + + weight_winograd43_data_int8_packed.release(); + } + + if (!weight_winograd23_data_int8_packed.empty()) + { + cmd.record_upload(weight_winograd23_data_int8_packed, weight_data_gpu_tm_winograd23, opt_fp32); + + weight_winograd23_data_int8_packed.release(); + } + } + + weight_data_int8_packed.release();/* released without upload on the winograd path */ + } + else + { + cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt);/* weight_data_gpu is what the 1x1, gemm and direct paths of forward_int8 bind */ + + weight_data_int8_packed.release(); + } + + cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt_fp32); + + weight_data_int8_descales.release(); + + const bool use_int8_requantize = int8_scale_term > 100;/* same threshold forward_int8 uses to select requantized output */ + if (use_int8_requantize) + { + cmd.record_upload(top_blob_int8_scales, top_blob_int8_scales_gpu, opt); + } + + if (bias_term) + { + cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt_fp32); + + bias_data_int8_packed.release(); + bias_data.release(); + } + + if (padding)/* padding is checked for null, quantize is called unconditionally */ + { + padding->upload_model(cmd, opt); + } + + quantize->upload_model(cmd, opt); + + return 0; +} + +int Convolution_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + const bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && 
dilation_h == 1;/* forward_int8 in outline: reject non-3d input, quantize bottom_blob through the quantize layer when its elements are not 8-bit, apply explicit or -233/-234 automatic padding through the padding layer, create top_blob, then record the winograd, 1x1, gemm or direct pipeline(s) selected by the flags computed here. Returns 0 on success, -1 for unsupported input or a channel mismatch, -100 when a blob cannot be created, or the sub-layer error code. */ + const bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + const bool use_winograd = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16; + const bool use_gemm = opt.use_sgemm_convolution && !is_conv1x1s1d1 && !use_winograd && num_input * maxk >= 8 && num_output >= 8;/* gemm applies only when neither the 1x1 nor the winograd path does */ + + // flattened blob, implement as InnerProduct/* NOTE: this int8 path does not implement it; it logs a replacement hint and returns -1 */ + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + + if (bottom_blob.dims != 3) + { + NCNN_LOGE("Convolution_vulkan int8 only supports 3d input for now"); + return -1; + } + + VkMat bottom = bottom_blob; + bool bottom_is_int8 = bottom.elembits() == 8;/* 8-bit elements skip the quantize step below */ + + const int elempack = opt.use_packing_layout && num_input % 4 == 0 ? 
4 : 1; + + if (!bottom_is_int8) + { + Option opt_quantize = opt; + opt_quantize.blob_vkallocator = opt.workspace_vkallocator;/* the quantized copy comes from the workspace allocator */ + opt_quantize.use_fp16_arithmetic = false; + + VkMat bottom_int8; + int ret = quantize->forward(bottom, bottom_int8, cmd, opt_quantize); + if (ret != 0) + return ret; + + bottom = bottom_int8; + bottom_is_int8 = true; + } + + int w = bottom.w; + int h = bottom.h; + const int channels = bottom.c * bottom.elempack;/* channel count with packing undone */ + + if (channels != num_input) + { + NCNN_LOGE("Convolution_vulkan int8 input channels mismatch"); + return -1; + } + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;/* kernel size after dilation */ + + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat bottom_bordered; + int ret = padding->forward(bottom, bottom_bordered, cmd, opt_pad); + if (ret != 0) + return ret; + + bottom = bottom_bordered; + w = bottom.w; + h = bottom.h; + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;/* total padding for which outw below becomes (w - 1) / stride_w + 1, likewise for h */ + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad / 2; + padding_params[1] = hpad - hpad / 2; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0;/* entries 0,1 split hpad and 2,3 split wpad, the first of each pair taking the smaller half; presumably ordered top, bottom, left, right, front, behind - confirm against the padding layer */ + + std::vector padding_inputs(2); + padding_inputs[0] = bottom; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + if (ret != 0) + return ret; + + bottom = padding_outputs[0]; + w = bottom.w; + h = 
bottom.h; + } + } + else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad - hpad / 2; + padding_params[1] = hpad / 2; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0;/* same totals as the -233 case, but the first of each pair takes the larger half */ + + std::vector padding_inputs(2); + padding_inputs[0] = bottom; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + if (ret != 0) + return ret; + + bottom = padding_outputs[0]; + w = bottom.w; + h = bottom.h; + } + } + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = (h - kernel_extent_h) / stride_h + 1; + + const bool use_int8_requantize = int8_scale_term > 100;/* selects the 1-byte output element size below */ + const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 
4 : 1; + size_t out_elemsize; + if (use_int8_requantize) + { + out_elemsize = out_elempack;/* 1 byte per element */ + } + else if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + out_elemsize = (size_t)2u * out_elempack;/* 2 bytes per element */ + } + else + { + out_elemsize = (size_t)4u * out_elempack;/* 4 bytes per element */ + } + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100;/* top_blob could not be created */ + + if (use_winograd) + { + bool pre_winograd43 = opt.use_winograd43_convolution;/* winograd43 when enabled; the size checks below fall back to winograd23 */ + const int w_bordered = w; + const int h_bordered = h; + if (opt.use_winograd23_convolution) + { + if (vkdev->info.type() == 0 && ((w_bordered <= 18 && h_bordered <= 18) || ((w_bordered >= 23 && w_bordered <= 24) && (h_bordered >= 23 && h_bordered <= 24)))) + pre_winograd43 = false; + if (vkdev->info.type() != 0 && (w_bordered <= 12 && h_bordered <= 12)) + pre_winograd43 = false; + } + + const int B = pre_winograd43 ? 36 : 16;/* multiplies the channel dimension of the transformed blobs and is the gemm dispatch depth */ + const int c4 = (channels + 3) / 4; + const int block_x = pre_winograd43 ? (outw + 3) / 4 : (outw + 1) / 2;/* tiles per row: 4 output columns each for winograd43, 2 for winograd23 */ + const int block_y = pre_winograd43 ? 
(outh + 3) / 4 : (outh + 1) / 2; + + VkMat bottom_tm_blob; + VkMat bottom_tm_blob_low; + VkMat bottom_tm_blob_high; + {/* stage 1: input transform into workspace blob(s) */ + if (use_cooperative_matrix) + { + if (elempack == 4) + { + bottom_tm_blob_low.create(block_x * block_y * 2, 1, c4 * B, (size_t)4u, 4, opt.workspace_vkallocator); + bottom_tm_blob_high = bottom_tm_blob_low;/* high shares the low blob, which is created with doubled width in this case */ + } + else + { + bottom_tm_blob_low.create(block_x * block_y, 1, channels * B, (size_t)1u, 1, opt.workspace_vkallocator); + bottom_tm_blob_high.create(block_x * block_y, 1, channels * B, (size_t)1u, 1, opt.workspace_vkallocator); + } + if (bottom_tm_blob_low.empty() || bottom_tm_blob_high.empty()) + return -100; + } + else + { + if (elempack == 4) + bottom_tm_blob.create(block_x * block_y, 1, c4 * B, (size_t)8u, 4, opt.workspace_vkallocator); + else + bottom_tm_blob.create(block_x * block_y, 1, channels * B, (size_t)2u, 1, opt.workspace_vkallocator); + if (bottom_tm_blob.empty()) + return -100; + } + + std::vector bindings(use_cooperative_matrix && elempack == 1 ? 3 : 2);/* a separate high binding exists only for cooperative matrix with elempack 1 */ + bindings[0] = bottom; + if (use_cooperative_matrix) + { + bindings[1] = bottom_tm_blob_low; + if (elempack == 1) + bindings[2] = bottom_tm_blob_high; + } + else + { + bindings[1] = bottom_tm_blob; + } + + std::vector constants(7); + constants[0].i = bottom.w; + constants[1].i = bottom.h; + constants[2].i = bottom.cstep; + constants[3].i = use_cooperative_matrix ? bottom_tm_blob_low.cstep : bottom_tm_blob.cstep; + constants[4].i = block_x; + constants[5].i = block_y; + constants[6].i = elempack == 4 ? c4 : channels; + + VkMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = use_cooperative_matrix ? bottom_tm_blob_low.c / B : bottom_tm_blob.c / B; + + const Pipeline* pipeline = pre_winograd43 ? 
pipeline_convolution_3x3s1d1_winograd43_transform_input : pipeline_convolution_3x3s1d1_winograd23_transform_input; + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + } + + VkMat top_tm_blob; + {/* stage 2: gemm from the transformed input into top_tm_blob, which holds 4-byte elements */ + top_tm_blob.create(block_x * block_y, 1, num_output / out_elempack * B, (size_t)4u * out_elempack, out_elempack, opt.workspace_vkallocator); + if (top_tm_blob.empty()) + return -100; + + if (use_cooperative_matrix) + { + std::vector bindings(6); + bindings[0] = bottom_tm_blob_low; + bindings[1] = bottom_tm_blob_high; + bindings[2] = bottom_tm_blob_low; + bindings[3] = top_tm_blob; + bindings[4] = top_tm_blob; + bindings[5] = pre_winograd43 ? weight_data_gpu_tm_winograd43_int8_cm : weight_data_gpu_tm_winograd23_int8_cm; + + std::vector constants(3); + constants[0].i = top_tm_blob.w; + constants[1].i = bottom_tm_blob_low.cstep; + constants[2].i = top_tm_blob.cstep; + + const int blocks_x = (top_tm_blob.w + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); + const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + + VkMat dispatcher; + dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N);/* blocks_x * blocks_y groups; the second factor has the same form as the local size x given to the winograd23 cooperative-matrix gemm pipeline in create_pipeline_int8 */ + dispatcher.h = 1; + dispatcher.c = B; + + const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm : pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm; + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + } + else + { + std::vector bindings(5); + bindings[0] = bottom_tm_blob; + bindings[1] = bottom_tm_blob; + bindings[2] = top_tm_blob; + bindings[3] = top_tm_blob; + bindings[4] = pre_winograd43 ? 
weight_data_gpu_tm_winograd43 : weight_data_gpu_tm_winograd23; + + std::vector constants(3); + constants[0].i = bottom_tm_blob.cstep; + constants[1].i = top_tm_blob.w; + constants[2].i = top_tm_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_tm_blob.w + 3) / 4; + dispatcher.h = (num_output + 3) / 4; + dispatcher.c = B; + + const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_gemm : pipeline_convolution_3x3s1d1_winograd23_gemm; + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + } + } + + {/* stage 3: output transform from top_tm_blob into top_blob, with bias, descales and output scales bound */ + std::vector bindings(6); + bindings[0] = top_tm_blob; + bindings[1] = top_blob; + bindings[2] = bias_data_gpu; + bindings[3] = weight_data_int8_descales_gpu; + bindings[4] = top_blob_int8_scales_gpu; + bindings[5] = top_blob; + + std::vector constants(7); + constants[0].i = top_tm_blob.cstep; + constants[1].i = block_x; + constants[2].i = block_y; + constants[3].i = top_blob.w; + constants[4].i = top_blob.h; + constants[5].i = top_blob.cstep; + constants[6].i = num_output; + + VkMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = top_blob.c; + + const Pipeline* pipeline = pre_winograd43 ? 
pipeline_convolution_3x3s1d1_winograd43_transform_output : pipeline_convolution_3x3s1d1_winograd23_transform_output; + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + } + } + else if (is_conv1x1s1d1)/* 1x1 stride 1 dilation 1: one pipeline, cooperative-matrix or packed variant */ + { + if (use_cooperative_matrix) + { + std::vector bindings_1x1(7); + bindings_1x1[0] = bottom; + bindings_1x1[1] = top_blob; + bindings_1x1[2] = top_blob; + bindings_1x1[3] = weight_data_gpu; + bindings_1x1[4] = bias_data_gpu; + bindings_1x1[5] = weight_data_int8_descales_gpu; + bindings_1x1[6] = top_blob_int8_scales_gpu; + + const int size = top_blob.w * top_blob.h; + + std::vector constants_1x1(5); + constants_1x1[0].u32 = bottom.cstep; + constants_1x1[1].u32 = top_blob.cstep; + constants_1x1[2].u32 = size; + constants_1x1[3].u32 = num_output; + constants_1x1[4].u32 = num_input; + + const int blocks_x = (size + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); + const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + + VkMat dispatcher; + dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N); + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings_1x1, constants_1x1, dispatcher); + } + else + { + std::vector bindings_1x1(7); + bindings_1x1[0] = bottom; + bindings_1x1[1] = top_blob; + bindings_1x1[2] = weight_data_gpu; + bindings_1x1[3] = bias_data_gpu; + bindings_1x1[4] = weight_data_int8_descales_gpu; + bindings_1x1[5] = top_blob_int8_scales_gpu; + bindings_1x1[6] = top_blob; + + const int num_input_packed = (num_input + 3) / 4 * 4; + const int num_output_packed = (num_output + 3) / 4 * 4; + const int outc_pack4 = num_output_packed / 4; + const int c_packed = num_input_packed / 4; + const int cstep_vec4 = bottom.elempack == 4 ? bottom.cstep : bottom.cstep / 4; + const int size = (top_blob.w * top_blob.h + 3) / 4; + const int outcstep_vec4 = out_elempack == 4 ? 
top_blob.cstep : top_blob.cstep / 4; + const int outcstep_native = top_blob.cstep / 4; + + std::vector constants_1x1(6); + constants_1x1[0].i = c_packed; + constants_1x1[1].i = cstep_vec4; + constants_1x1[2].i = outc_pack4; + constants_1x1[3].i = outcstep_vec4; + constants_1x1[4].i = outcstep_native; + constants_1x1[5].i = size; + + VkMat dispatcher; + dispatcher.w = size; + dispatcher.h = outc_pack4; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings_1x1, constants_1x1, dispatcher); + } + } + else if (use_gemm)/* gemm path: one pipeline, cooperative-matrix or packed variant */ + { + if (use_cooperative_matrix) + { + std::vector bindings(7); + bindings[0] = bottom; + bindings[1] = top_blob; + bindings[2] = top_blob; + bindings[3] = weight_data_gpu; + bindings[4] = bias_data_gpu; + bindings[5] = weight_data_int8_descales_gpu; + bindings[6] = top_blob_int8_scales_gpu; + + std::vector constants(8); + constants[0].u32 = bottom.w; + constants[1].u32 = bottom.h; + constants[2].u32 = bottom.cstep; + constants[3].u32 = top_blob.w; + constants[4].u32 = top_blob.h; + constants[5].u32 = top_blob.cstep; + constants[6].u32 = num_output; + constants[7].u32 = num_input; + + const int size = top_blob.w * top_blob.h; + const int blocks_x = (size + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); + const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + + VkMat dispatcher; + dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N); + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); + } + else + { + const int c_packed = (num_input + 3) / 4; + const int outc_pack4 = (num_output + 3) / 4; + + std::vector bindings(10); + bindings[0] = bottom; + bindings[1] = top_blob; + bindings[2] = bottom; + bindings[3] = top_blob; + bindings[4] = weight_data_gpu; + bindings[5] = bias_data_gpu; + bindings[6] = 
weight_data_int8_descales_gpu; + bindings[7] = top_blob_int8_scales_gpu; + bindings[8] = top_blob; + bindings[9] = top_blob; + + std::vector constants(8); + constants[0].i = bottom.w; + constants[1].i = bottom.h; + constants[2].i = c_packed;/* NOTE(review): the direct path below passes num_input here when bottom.elempack is not 4; this path always passes c_packed - confirm this matches the gemm shader */ + constants[3].i = bottom.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = outc_pack4; + constants[7].i = out_elempack == 4 ? top_blob.cstep : top_blob.cstep * 4; + + VkMat dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = outc_pack4; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); + } + } + else/* direct convolution fallback */ + { + const int c_packed = (num_input + 3) / 4; + const int c_shader = bottom.elempack == 4 ? c_packed : num_input; + const int outc_pack4 = (num_output + 3) / 4; + + std::vector bindings(10); + bindings[0] = bottom; + bindings[1] = top_blob; + bindings[2] = bottom; + bindings[3] = top_blob; + bindings[4] = weight_data_gpu; + bindings[5] = bias_data_gpu; + bindings[6] = weight_data_int8_descales_gpu; + bindings[7] = top_blob_int8_scales_gpu; + bindings[8] = top_blob; + bindings[9] = top_blob; + + std::vector constants(8); + constants[0].i = bottom.w; + constants[1].i = bottom.h; + constants[2].i = c_shader; + constants[3].i = bottom.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = outc_pack4; + constants[7].i = out_elempack == 4 ? 
top_blob.cstep : top_blob.cstep * 4; + + VkMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (outc_pack4 + 1) / 2;/* all three dispatch dimensions are halved, rounding up */ + + cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h index eeb6bba07d26..d03c095f11d4 100644 --- a/src/layer/vulkan/convolution_vulkan.h +++ b/src/layer/vulkan/convolution_vulkan.h @@ -23,6 +23,13 @@ class Convolution_vulkan : public Convolution using Convolution::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int upload_model_int8(VkTransfer& cmd, const Option& opt); + int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +#endif + public: ncnn::Layer* padding; @@ -35,7 +42,6 @@ class Convolution_vulkan : public Convolution Pipeline* pipeline_convolution; Pipeline* pipeline_convolution_1x1s1d1; - Pipeline* pipeline_convolution_gemm; // winograd23 and winograd43 @@ -64,6 +70,26 @@ class Convolution_vulkan : public Convolution int UNROLL_SG_K; int UNROLL_WG_M; int UNROLL_WG_N; + +#if NCNN_INT8 + ncnn::Layer* quantize; + + VkMat weight_data_gpu_tm_winograd23_int8_cm; + VkMat weight_data_gpu_tm_winograd43_int8_cm; + Pipeline* pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm; + Pipeline* pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm; + + Mat weight_data_int8_packed; + Mat weight_winograd23_data_int8_packed; + Mat weight_winograd23_data_int8_packed_cm; + Mat weight_winograd43_data_int8_packed; + Mat weight_winograd43_data_int8_packed_cm; + Mat weight_data_int8_descales; + Mat bias_data_int8_packed; + + VkMat weight_data_int8_descales_gpu; + VkMat top_blob_int8_scales_gpu; +#endif }; } // namespace ncnn diff --git 
a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 44e5976007c8..c41606fa0e54 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -5,6 +5,9 @@ #include "layer_shader_type.h" #include "layer_type.h" +#include "modelbin.h" + +#include namespace ncnn { @@ -22,6 +25,10 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() pipeline_convolutiondepthwise_group_pack4 = 0; pipeline_convolutiondepthwise_group_pack1to4 = 0; pipeline_convolutiondepthwise_group_pack4to1 = 0; + +#if NCNN_INT8 + quantize = 0; +#endif } int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd) @@ -33,11 +40,26 @@ int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd) support_vulkan = false; } +#if NCNN_INT8 + if (int8_scale_term && pad_value != 0.f) + { + NCNN_LOGE("ConvolutionDepthWise_vulkan int8 nonzero pad value is not supported"); + support_vulkan = false; + } +#endif + return ret; } int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term) + { + return create_pipeline_int8(opt); + } +#endif + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; @@ -310,11 +332,27 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) delete pipeline_convolutiondepthwise_group_pack4to1; pipeline_convolutiondepthwise_group_pack4to1 = 0; +#if NCNN_INT8 + if (quantize) + { + quantize->destroy_pipeline(opt); + delete quantize; + quantize = 0; + } +#endif + return 0; } int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term) + { + return upload_model_int8(cmd, opt); + } +#endif + if (padding) { padding->upload_model(cmd, opt); @@ -349,6 +387,13 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { +#if NCNN_INT8 + if (int8_scale_term) + { + return forward_int8(bottom_blob, top_blob, cmd, opt); + } +#endif + int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -539,4 +584,801 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl return 0; } +#if NCNN_INT8 +int ConvolutionDepthWise_vulkan::create_pipeline_int8(const Option& opt) +{ + Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + Mat out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; + + if (shape.dims != 3) shape = Mat(); + if (out_shape.dims != 3) out_shape = Mat(); + + const int maxk = kernel_w * kernel_h; + if (group == 0 || num_output % group != 0) + { + NCNN_LOGE("ConvolutionDepthWise_vulkan int8 invalid group"); + return -1; + } + + const int num_output_g = num_output / group; + const int weight_data_size_g = group * maxk * num_output_g; + if (weight_data_size_g == 0 || weight_data_size % weight_data_size_g != 0) + { + NCNN_LOGE("ConvolutionDepthWise_vulkan int8 weight shape mismatch"); + return -1; + } + + int channels = weight_data_size / weight_data_size_g * group; + const bool is_depthwise = channels == group && group == num_output; + const int channels_g = channels / group; + const int elempack = is_depthwise && opt.use_packing_layout && group % 4 == 0 ? 4 : 1; + const int elempack_g = !is_depthwise && opt.use_packing_layout && channels_g % 4 == 0 ? 4 : 1; + const int out_elempack_g = !is_depthwise && opt.use_packing_layout && num_output_g % 4 == 0 ? 4 : 1; + const int bottom_elempack = is_depthwise ? 
elempack : elempack_g; + const int num_output_g_pack4 = (num_output_g + 3) / 4; + const int num_output_g_pack4_aligned = (num_output_g_pack4 + 7) / 8 * 8; + + if (weight_data.elemsize != (size_t)1u) + { + NCNN_LOGE("ConvolutionDepthWise_vulkan int8 weight data is not int8"); + return -1; + } + + Option opt_int8 = opt; + opt_int8.use_fp16_arithmetic = false; + opt_int8.use_int16_packed = false; + opt_int8.use_int16_storage = false; + + Mat shape_int8; + if (shape.dims == 3) + { + shape_int8 = Mat(shape.w, shape.h, channels / bottom_elempack, (void*)0, (size_t)bottom_elempack, bottom_elempack); + } + + Mat shape_int8_bordered; + if (shape_int8.dims == 3) + { + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + shape_int8_bordered = Mat(shape_int8.w + pad_left + pad_right, shape_int8.h + pad_top + pad_bottom, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack); + } + else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) + { + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int wpad = kernel_extent_w + (shape_int8.w - 1) / stride_w * stride_w - shape_int8.w; + int hpad = kernel_extent_h + (shape_int8.h - 1) / stride_h * stride_h - shape_int8.h; + if (wpad > 0 || hpad > 0) + { + shape_int8_bordered = Mat(shape_int8.w + wpad, shape_int8.h + hpad, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack); + } + else + { + shape_int8_bordered = shape_int8; + } + } + else + { + shape_int8_bordered = shape_int8; + } + } + + Mat shape_padding_int8_bordered; + if (shape_int8_bordered.dims == 3) + { + const int padding_outc = shape_int8_bordered.c * shape_int8_bordered.elempack; + const int padding_out_elempack = padding_outc % 4 == 0 ? 
4 : 1; + const size_t padding_out_elemsize = shape_int8_bordered.elemsize / shape_int8_bordered.elempack * padding_out_elempack; + shape_padding_int8_bordered = Mat(shape_int8_bordered.w, shape_int8_bordered.h, padding_outc / padding_out_elempack, (void*)0, padding_out_elemsize, padding_out_elempack); + } + + { + quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize); + quantize->vkdev = vkdev; + + Mat shape_quantize; + Mat out_shape_quantize; + if (shape.dims == 3) + { + size_t shape_elemsize = shape.elemsize; + if (shape.elempack != bottom_elempack) + shape_elemsize = shape.elemsize / shape.elempack * bottom_elempack; + + shape_quantize = Mat(shape.w, shape.h, channels / bottom_elempack, (void*)0, shape_elemsize, bottom_elempack); + out_shape_quantize = shape_int8; + } + + quantize->bottom_shapes.resize(1); + quantize->bottom_shapes[0] = shape_quantize; + quantize->top_shapes.resize(1); + quantize->top_shapes[0] = out_shape_quantize; + + ncnn::ParamDict pd; + pd.set(0, 1); + quantize->load_param(pd); + + Mat weights[1]; + weights[0] = bottom_blob_int8_scales; + quantize->load_model(ModelBinFromMatArray(weights)); + + quantize->create_pipeline(opt_int8); + } + + { + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); + padding->vkdev = vkdev; + + padding->bottom_shapes.resize(1); + padding->bottom_shapes[0] = shape_int8; + padding->top_shapes.resize(1); + padding->top_shapes[0] = shape_padding_int8_bordered; + + ncnn::ParamDict pd; + pd.set(0, pad_top); + pd.set(1, pad_bottom); + pd.set(2, pad_left); + pd.set(3, pad_right); + pd.set(4, 0); + pd.set(5, 0.f); + + padding->load_param(pd); + + padding->create_pipeline(opt); + } + + if (is_depthwise) + { + const int maxk4 = (maxk + 3) / 4 * 4; + + if (elempack == 4) + { + const Mat weight_data_r2 = weight_data.reshape(maxk, group); + + weight_data_int8_packed.create(maxk4 / 4, group / 4, (size_t)16u, 1); + memset(weight_data_int8_packed.data, 0, weight_data_int8_packed.total() * 
weight_data_int8_packed.elemsize); + + for (int q = 0; q + 3 < group; q += 4) + { + signed char* g00 = weight_data_int8_packed.row(q / 4); + const signed char* k0 = weight_data_r2.row(q); + const signed char* k1 = weight_data_r2.row(q + 1); + const signed char* k2 = weight_data_r2.row(q + 2); + const signed char* k3 = weight_data_r2.row(q + 3); + + for (int k = 0; k < maxk4; k += 4) + { + signed char* g0 = g00 + k * 4; + signed char* g1 = g0 + 4; + signed char* g2 = g1 + 4; + signed char* g3 = g2 + 4; + + for (int i = 0; i < 4 && k + i < maxk; i++) + { + g0[i] = k0[k + i]; + g1[i] = k1[k + i]; + g2[i] = k2[k + i]; + g3[i] = k3[k + i]; + } + } + } + } + else + { + const Mat weight_data_r2 = weight_data.reshape(maxk, group); + + weight_data_int8_packed.create(maxk4 / 4, group, (size_t)4u, 4); + weight_data_int8_packed.fill(0); + + for (int q = 0; q < group; q++) + { + const signed char* k0 = weight_data_r2.row(q); + signed char* g00 = weight_data_int8_packed.row(q); + + for (int k = 0; k < maxk; k++) + { + g00[k] = k0[k]; + } + } + } + } + else + { + const Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); + + if (elempack_g == 4) + { + weight_data_int8_packed.create(maxk, channels_g / 4, num_output_g_pack4_aligned * group, (size_t)16u, 16); + weight_data_int8_packed.fill(0); + + for (int g = 0; g < group; g++) + { + const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g); + + for (int q = 0; q < num_output_g; q += 4) + { + const int outch_pack = std::min(4, num_output_g - q); + Mat weight_data_packed = weight_data_int8_packed.channel(g * num_output_g_pack4_aligned + q / 4); + + for (int p = 0; p < channels_g; p += 4) + { + for (int k = 0; k < maxk; k++) + { + signed char* g00 = weight_data_packed.row(p / 4) + k * 16; + + for (int i = 0; i < outch_pack; i++) + { + const signed char* k0 = weight_data_r2.channel(q + i).row(p); + const signed char* k1 = weight_data_r2.channel(q + i).row(p + 1); 
+ const signed char* k2 = weight_data_r2.channel(q + i).row(p + 2); + const signed char* k3 = weight_data_r2.channel(q + i).row(p + 3); + + g00[i * 4 + 0] = k0[k]; + g00[i * 4 + 1] = k1[k]; + g00[i * 4 + 2] = k2[k]; + g00[i * 4 + 3] = k3[k]; + } + } + } + } + } + } + else + { + weight_data_int8_packed.create(maxk * channels_g, num_output_g_pack4_aligned * group, (size_t)4u, 4); + weight_data_int8_packed.fill(0); + + for (int g = 0; g < group; g++) + { + const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g); + + for (int q = 0; q < num_output_g; q += 4) + { + const int outch_pack = std::min(4, num_output_g - q); + signed char* g00 = weight_data_int8_packed.row(g * num_output_g_pack4_aligned + q / 4); + + for (int p = 0; p < channels_g; p++) + { + for (int k = 0; k < maxk; k++) + { + signed char* g00p = g00 + (p * maxk + k) * 4; + + for (int i = 0; i < outch_pack; i++) + { + const signed char* k0 = weight_data_r2.channel(q + i).row(p); + g00p[i] = k0[k]; + } + } + } + } + } + } + } + + const bool use_int8_requantize = int8_scale_term > 100; + + if (is_depthwise) + { + if (elempack == 4) + { + weight_data_int8_descales.create(group / 4, (size_t)16u, 4); + + float* outptr = weight_data_int8_descales; + for (int g = 0; g < group; g++) + { + const float bottom_scale = bottom_blob_int8_scales[g]; + const float weight_scale = weight_data_int8_scales[g]; + outptr[g] = bottom_scale == 0.f || weight_scale == 0.f ? 0.f : 1.f / (bottom_scale * weight_scale); + } + } + else + { + weight_data_int8_descales.create(group, (size_t)4u, 1); + + float* outptr = weight_data_int8_descales; + for (int g = 0; g < group; g++) + { + const float bottom_scale = bottom_blob_int8_scales[g]; + const float weight_scale = weight_data_int8_scales[g]; + outptr[g] = bottom_scale == 0.f || weight_scale == 0.f ? 
0.f : 1.f / (bottom_scale * weight_scale); + } + } + } + else + { + weight_data_int8_descales.create(num_output_g_pack4_aligned * group, (size_t)16u, 4); + weight_data_int8_descales.fill(0.f); + + float* outptr = weight_data_int8_descales; + for (int g = 0; g < group; g++) + { + const float bottom_scale = bottom_blob_int8_scales[g]; + const float weight_scale = weight_data_int8_scales[g]; + const float descale = bottom_scale == 0.f || weight_scale == 0.f ? 0.f : 1.f / (bottom_scale * weight_scale); + + for (int q = 0; q < num_output_g; q++) + { + outptr[g * num_output_g_pack4_aligned * 4 + q] = descale; + } + } + } + + if (use_int8_requantize) + { + if (is_depthwise) + { + if (elempack == 4) + { + top_blob_int8_scales_packed.create(group / 4, (size_t)16u, 4); + + float* outptr = top_blob_int8_scales_packed; + for (int g = 0; g < group; g++) + { + outptr[g] = top_blob_int8_scales[g]; + } + } + else + { + top_blob_int8_scales_packed = top_blob_int8_scales; + } + } + else + { + top_blob_int8_scales_packed.create(num_output_g_pack4_aligned * group, (size_t)16u, 4); + top_blob_int8_scales_packed.fill(0.f); + + float* outptr = top_blob_int8_scales_packed; + for (int g = 0; g < group; g++) + { + const float top_scale = top_blob_int8_scales[g]; + + for (int q = 0; q < num_output_g; q++) + { + outptr[g * num_output_g_pack4_aligned * 4 + q] = top_scale; + } + } + } + } + + if (bias_term) + { + if (is_depthwise) + { + if (elempack == 4) + { + bias_data_int8_packed.create(num_output / 4, (size_t)16u, 4); + bias_data_int8_packed.fill(0.f); + + float* outptr = bias_data_int8_packed; + for (int q = 0; q < num_output; q++) + { + outptr[q] = bias_data[q]; + } + } + else + { + bias_data_int8_packed = bias_data; + } + } + else + { + bias_data_int8_packed.create(num_output_g_pack4_aligned * group, (size_t)16u, 4); + bias_data_int8_packed.fill(0.f); + + float* outptr = bias_data_int8_packed; + for (int q = 0; q < num_output; q++) + { + const int g = q / num_output_g; + const int qg = q 
- g * num_output_g; + outptr[g * num_output_g_pack4_aligned * 4 + qg] = bias_data[q]; + } + } + } + + const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1; + const bool use_sfp_output = !use_int8_requantize && (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed); + size_t out_elemsize; + if (use_int8_requantize) + { + out_elemsize = out_elempack; + } + else if (use_sfp_output) + { + out_elemsize = (size_t)2u * out_elempack; + } + else + { + out_elemsize = (size_t)4u * out_elempack; + } + + size_t out_elemsize_g; + if (use_int8_requantize) + { + out_elemsize_g = out_elempack_g; + } + else if (use_sfp_output) + { + out_elemsize_g = (size_t)2u * out_elempack_g; + } + else + { + out_elemsize_g = (size_t)4u * out_elempack_g; + } + + Mat out_shape_int8; + if (out_shape.dims == 3) + out_shape_int8 = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack); + + Mat out_shape_int8_g; + if (out_shape.dims == 3) + out_shape_int8_g = Mat(out_shape.w, out_shape.h, num_output / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g); + + std::vector specializations(12 + 10); + specializations[0].i = kernel_w; + specializations[1].i = kernel_h; + specializations[2].i = dilation_w; + specializations[3].i = dilation_h; + specializations[4].i = stride_w; + specializations[5].i = stride_h; + specializations[6].i = bias_term; + specializations[7].i = group; + specializations[8].i = activation_type; + specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[11].i = use_int8_requantize ? 
1 : 0; + + if (is_depthwise) + { + specializations[12 + 0].i = shape_int8_bordered.dims; + specializations[12 + 1].i = shape_int8_bordered.w; + specializations[12 + 2].i = shape_int8_bordered.h; + specializations[12 + 3].i = shape_int8_bordered.c; + specializations[12 + 4].i = shape_int8_bordered.cstep; + specializations[12 + 5].i = out_shape_int8.dims; + specializations[12 + 6].i = out_shape_int8.w; + specializations[12 + 7].i = out_shape_int8.h; + specializations[12 + 8].i = out_shape_int8.c; + specializations[12 + 9].i = out_shape_int8.cstep; + + Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0); + if (out_shape.dims != 0) + { + local_size_xyz.w = std::min(8, out_shape.w); + local_size_xyz.h = std::min(8, out_shape.h); + local_size_xyz.c = std::min(4, out_shape_int8.c); + } + + if (opt.use_packing_layout && group % 4 == 0) + { + pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev); + pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4_int8, opt_int8, specializations); + } + else + { + pipeline_convolutiondepthwise = new Pipeline(vkdev); + pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise_int8, opt_int8, specializations); + } + } + else + { + std::vector specializations_group(15 + 10); + for (int i = 0; i < 12; i++) + { + specializations_group[i] = specializations[i]; + } + specializations_group[12].i = elempack_g; + specializations_group[13].i = out_elempack_g; + specializations_group[14].i = num_output_g; + for (int i = 0; i < 10; i++) + { + specializations_group[15 + i].i = 0; + } + + Mat local_size_xyz(8, 8, 1, (void*)0); + if (out_shape.dims != 0) + { + local_size_xyz.w = std::min(8, out_shape.w); + local_size_xyz.h = std::min(8, out_shape.h); + local_size_xyz.c = std::min(4, group * num_output_g_pack4); + } + 
+ pipeline_convolutiondepthwise_group = new Pipeline(vkdev); + pipeline_convolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_group->create(LayerShaderType::convolutiondepthwise_group_packed_int8, opt_int8, specializations_group); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt) +{ + Option opt_fp32 = opt; + opt_fp32.use_fp16_packed = false; + opt_fp32.use_fp16_storage = false; + opt_fp32.use_bf16_packed = false; + opt_fp32.use_bf16_storage = false; + + cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt); + + weight_data_int8_packed.release(); + + cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt_fp32); + + weight_data_int8_descales.release(); + + const bool use_int8_requantize = int8_scale_term > 100; + if (use_int8_requantize) + { + cmd.record_upload(top_blob_int8_scales_packed, top_blob_int8_scales_gpu, opt); + + top_blob_int8_scales_packed.release(); + } + + if (bias_term) + { + cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt_fp32); + + bias_data_int8_packed.release(); + + bias_data.release(); + } + + if (padding) + { + padding->upload_model(cmd, opt); + } + + quantize->upload_model(cmd, opt); + + return 0; +} + +int ConvolutionDepthWise_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + VkMat bottom = bottom_blob; + bool bottom_is_int8 = bottom.elembits() == 8; + + int channels = bottom.c * bottom.elempack; + const bool is_depthwise = channels == group && group == num_output; + const int channels_g = channels / group; + const int num_output_g = num_output / group; + const int elempack = is_depthwise && opt.use_packing_layout && channels % 4 == 0 ? 4 : 1; + const int elempack_g = !is_depthwise && opt.use_packing_layout && channels_g % 4 == 0 ? 
4 : 1; + const int out_elempack_g = !is_depthwise && opt.use_packing_layout && num_output_g % 4 == 0 ? 4 : 1; + const int bottom_elempack = is_depthwise ? elempack : elempack_g; + + Option opt_workspace = opt; + opt_workspace.blob_vkallocator = opt.workspace_vkallocator; + opt_workspace.use_fp16_arithmetic = false; + + if (bottom.elempack != bottom_elempack) + { + VkMat bottom_unpacked; + vkdev->convert_packing(bottom, bottom_unpacked, bottom_elempack, cmd, opt_workspace); + bottom = bottom_unpacked; + } + + if (!bottom_is_int8) + { + VkMat bottom_int8; + int ret = quantize->forward(bottom, bottom_int8, cmd, opt_workspace); + if (ret != 0) + return ret; + + bottom = bottom_int8; + bottom_is_int8 = true; + } + + int w = bottom.w; + int h = bottom.h; + channels = bottom.c * bottom.elempack; + + if (channels % group != 0) + { + NCNN_LOGE("ConvolutionDepthWise_vulkan int8 input channels mismatch"); + return -1; + } + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + VkMat bottom_bordered; + int ret = padding->forward(bottom, bottom_bordered, cmd, opt_workspace); + if (ret != 0) + return ret; + + bottom = bottom_bordered; + w = bottom.w; + h = bottom.h; + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad / 2; + padding_params[1] = hpad - hpad / 2; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom; + padding_inputs[1] = 
padding_param_blob; + + std::vector padding_outputs(1); + int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_workspace); + if (ret != 0) + return ret; + + bottom = padding_outputs[0]; + w = bottom.w; + h = bottom.h; + } + } + else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad - hpad / 2; + padding_params[1] = hpad / 2; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_workspace); + if (ret != 0) + return ret; + + bottom = padding_outputs[0]; + w = bottom.w; + h = bottom.h; + } + } + + if (bottom.elempack != bottom_elempack) + { + VkMat bottom_unpacked; + vkdev->convert_packing(bottom, bottom_unpacked, bottom_elempack, cmd, opt_workspace); + bottom = bottom_unpacked; + w = bottom.w; + h = bottom.h; + } + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = (h - kernel_extent_h) / stride_h + 1; + + const bool use_int8_requantize = int8_scale_term > 100; + const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 
4 : 1; + const bool use_sfp_output = !use_int8_requantize && (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed); + size_t out_elemsize; + if (use_int8_requantize) + { + out_elemsize = out_elempack; + } + else if (use_sfp_output) + { + out_elemsize = (size_t)2u * out_elempack; + } + else + { + out_elemsize = (size_t)4u * out_elempack; + } + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkMat top_blob_unpacked = top_blob; + if (!is_depthwise && out_elempack_g != out_elempack) + { + size_t out_elemsize_g; + if (use_int8_requantize) + { + out_elemsize_g = out_elempack_g; + } + else if (use_sfp_output) + { + out_elemsize_g = (size_t)2u * out_elempack_g; + } + else + { + out_elemsize_g = (size_t)4u * out_elempack_g; + } + + top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + std::vector constants(10); + constants[0].i = bottom.dims; + constants[1].i = bottom.w; + constants[2].i = bottom.h; + constants[3].i = bottom.c; + constants[4].i = bottom.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + + if (is_depthwise) + { + std::vector bindings(7); + bindings[0] = bottom; + bindings[1] = top_blob_unpacked; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + bindings[4] = weight_data_int8_descales_gpu; + bindings[5] = top_blob_int8_scales_gpu; + // binding 6 aliases top with int8 SSBO element type + bindings[6] = top_blob_unpacked; + + const Pipeline* pipeline = bottom.elempack == 4 ? 
pipeline_convolutiondepthwise_pack4 : pipeline_convolutiondepthwise; + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + else + { + std::vector bindings(10); + bindings[0] = bottom; + bindings[1] = top_blob_unpacked; + bindings[2] = bottom; + bindings[3] = top_blob_unpacked; + bindings[4] = weight_data_gpu; + bindings[5] = bias_data_gpu; + bindings[6] = weight_data_int8_descales_gpu; + bindings[7] = top_blob_int8_scales_gpu; + bindings[8] = top_blob_unpacked; + bindings[9] = top_blob_unpacked; + + const int num_output_g_pack4 = (num_output_g + 3) / 4; + + std::vector constants_group = constants; + constants_group[8].i = num_output_g_pack4 * group; + constants_group[9].i = out_elempack_g == 4 ? top_blob_unpacked.cstep : top_blob_unpacked.cstep * 4; + + VkMat dispatcher; + dispatcher.w = top_blob_unpacked.w; + dispatcher.h = top_blob_unpacked.h; + dispatcher.c = group * num_output_g_pack4; + + cmd.record_pipeline(pipeline_convolutiondepthwise_group, bindings, constants_group, dispatcher); + + if (out_elempack_g != out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h index da22c82097a0..cf110296ef0a 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h @@ -23,6 +23,13 @@ class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise using ConvolutionDepthWise::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int upload_model_int8(VkTransfer& cmd, const Option& opt); + int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +#endif + public: Mat weight_data_packed; Mat 
weight_data_packed_groups; @@ -34,11 +41,22 @@ class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise Pipeline* pipeline_convolutiondepthwise; Pipeline* pipeline_convolutiondepthwise_pack4; - Pipeline* pipeline_convolutiondepthwise_group; Pipeline* pipeline_convolutiondepthwise_group_pack4; Pipeline* pipeline_convolutiondepthwise_group_pack1to4; Pipeline* pipeline_convolutiondepthwise_group_pack4to1; + +#if NCNN_INT8 + ncnn::Layer* quantize; + + Mat weight_data_int8_packed; + Mat weight_data_int8_descales; + Mat top_blob_int8_scales_packed; + Mat bias_data_int8_packed; + + VkMat weight_data_int8_descales_gpu; + VkMat top_blob_int8_scales_gpu; +#endif }; } // namespace ncnn diff --git a/src/layer/vulkan/flatten_vulkan.cpp b/src/layer/vulkan/flatten_vulkan.cpp index 40483bd670b2..50ec77e7cccb 100644 --- a/src/layer/vulkan/flatten_vulkan.cpp +++ b/src/layer/vulkan/flatten_vulkan.cpp @@ -15,6 +15,11 @@ Flatten_vulkan::Flatten_vulkan() pipeline_flatten = 0; pipeline_flatten_pack4 = 0; pipeline_flatten_pack1to4 = 0; +#if NCNN_INT8 + pipeline_flatten_int8 = 0; + pipeline_flatten_pack4_int8 = 0; + pipeline_flatten_pack1to4_int8 = 0; +#endif } int Flatten_vulkan::create_pipeline(const Option& opt) @@ -42,6 +47,34 @@ int Flatten_vulkan::create_pipeline(const Option& opt) local_size_xyz.c = 1; } +#if NCNN_INT8 + Mat shape_int8; + if (shape.dims == 1) shape_int8 = Mat(shape.w, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 2) shape_int8 = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 3) shape_int8 = Mat(shape.w, shape.h, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 4) shape_int8 = Mat(shape.w, shape.h, shape.d, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack); + + Mat out_shape_int8; + if (out_shape.dims == 1) out_shape_int8 = Mat(out_shape.w, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 2) out_shape_int8 = 
Mat(out_shape.w, out_shape.h, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 3) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 4) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + + std::vector specializations_int8 = specializations; + specializations_int8[0 + 0].i = std::min(3, shape_int8.dims); + specializations_int8[0 + 1].i = shape_int8.w; + specializations_int8[0 + 2].i = shape_int8.h * shape_int8.d; + specializations_int8[0 + 3].i = shape_int8.c; + specializations_int8[0 + 4].i = shape_int8.cstep; + specializations_int8[0 + 5].i = std::min(3, out_shape_int8.dims); + specializations_int8[0 + 6].i = out_shape_int8.w; + specializations_int8[0 + 7].i = out_shape_int8.h * out_shape_int8.d; + specializations_int8[0 + 8].i = out_shape_int8.c; + specializations_int8[0 + 9].i = out_shape_int8.cstep; + + const bool use_int8_pipeline = opt.use_int8_packed || opt.use_int8_storage; +#endif + // pack1 if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 1)) { @@ -66,6 +99,32 @@ int Flatten_vulkan::create_pipeline(const Option& opt) pipeline_flatten_pack1to4->create(LayerShaderType::flatten_pack1to4, opt, specializations); } +#if NCNN_INT8 + if (use_int8_pipeline) + { + if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 1)) + { + pipeline_flatten_int8 = new Pipeline(vkdev); + pipeline_flatten_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_int8->create(LayerShaderType::flatten_int8, opt, specializations_int8); + } + + if (shape.dims == 0 || (shape.elempack == 4 && out_shape.elempack == 4)) + { + pipeline_flatten_pack4_int8 = new Pipeline(vkdev); + pipeline_flatten_pack4_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack4_int8->create(LayerShaderType::flatten_pack4_int8, opt, 
specializations_int8); + } + + if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 4)) + { + pipeline_flatten_pack1to4_int8 = new Pipeline(vkdev); + pipeline_flatten_pack1to4_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack1to4_int8->create(LayerShaderType::flatten_pack1to4_int8, opt, specializations_int8); + } + } +#endif + return 0; } @@ -80,6 +139,17 @@ int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/) delete pipeline_flatten_pack1to4; pipeline_flatten_pack1to4 = 0; +#if NCNN_INT8 + delete pipeline_flatten_int8; + pipeline_flatten_int8 = 0; + + delete pipeline_flatten_pack4_int8; + pipeline_flatten_pack4_int8 = 0; + + delete pipeline_flatten_pack1to4_int8; + pipeline_flatten_pack1to4_int8 = 0; +#endif + return 0; } @@ -138,7 +208,28 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute constants[9].i = top_blob.cstep; const Pipeline* pipeline = 0; +#if NCNN_INT8 + if (bottom_blob.elembits() == 8 && elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_flatten_int8; + } + else if (bottom_blob.elembits() == 8 && elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack4_int8; + } + else if (bottom_blob.elembits() == 8 && elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack1to4_int8; + } + else if (elempack == 1 && out_elempack == 1) +#else + if (bottom_blob.elembits() == 8) + { + return -1; + } + if (elempack == 1 && out_elempack == 1) +#endif { pipeline = pipeline_flatten; } @@ -151,6 +242,9 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute pipeline = pipeline_flatten_pack1to4; } + if (!pipeline) + return -1; + cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h index e3425385ba99..880a3a31a41e 100644 --- a/src/layer/vulkan/flatten_vulkan.h +++ b/src/layer/vulkan/flatten_vulkan.h @@ -23,6 +23,11 @@ 
class Flatten_vulkan : public Flatten Pipeline* pipeline_flatten; Pipeline* pipeline_flatten_pack4; Pipeline* pipeline_flatten_pack1to4; +#if NCNN_INT8 + Pipeline* pipeline_flatten_int8; + Pipeline* pipeline_flatten_pack4_int8; + Pipeline* pipeline_flatten_pack1to4_int8; +#endif }; } // namespace ncnn diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp index 682729c95550..dfc0bb250270 100644 --- a/src/layer/vulkan/gemm_vulkan.cpp +++ b/src/layer/vulkan/gemm_vulkan.cpp @@ -7,6 +7,16 @@ namespace ncnn { +#if NCNN_INT8 +static inline signed char float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} +#endif // NCNN_INT8 + Gemm_vulkan::Gemm_vulkan() { support_vulkan = true; @@ -14,6 +24,12 @@ Gemm_vulkan::Gemm_vulkan() support_vulkan_any_packing = true; pipeline_gemm = 0; +#if NCNN_INT8 + pipeline_gemm_quantize_A_int8 = 0; + pipeline_gemm_quantize_B_absmax_int8 = 0; + pipeline_gemm_quantize_B_descale_int8 = 0; + pipeline_gemm_quantize_B_int8 = 0; +#endif use_subgroup_ops = false; @@ -29,20 +45,15 @@ Gemm_vulkan::Gemm_vulkan() UNROLL_WG_N = 1; } -int Gemm_vulkan::load_param(const ParamDict& pd) +int Gemm_vulkan::create_pipeline(const Option& opt) { - int ret = Gemm::load_param(pd); - +#if NCNN_INT8 if (int8_scale_term) { - support_vulkan = false; + return create_pipeline_int8(opt); } +#endif - return ret; -} - -int Gemm_vulkan::create_pipeline(const Option& opt) -{ // const Mat& shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; if (constantA) @@ -599,6 +610,20 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/) delete pipeline_gemm; pipeline_gemm = 0; +#if NCNN_INT8 + delete pipeline_gemm_quantize_A_int8; + pipeline_gemm_quantize_A_int8 = 0; + + delete pipeline_gemm_quantize_B_absmax_int8; + pipeline_gemm_quantize_B_absmax_int8 = 0; + + delete pipeline_gemm_quantize_B_descale_int8; + pipeline_gemm_quantize_B_descale_int8 = 0; + + delete pipeline_gemm_quantize_B_int8; + pipeline_gemm_quantize_B_int8 = 0; +#endif + use_subgroup_ops = false; use_cooperative_matrix = false; @@ -617,6 +642,13 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/) int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term) + { + return upload_model_int8(cmd, opt); + } +#endif + if (constantA) { cmd.record_upload(A_data_packed, A_data_gpu, opt); @@ -643,6 +675,13 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int Gemm_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const { +#if NCNN_INT8 + if (int8_scale_term) + { + return forward_int8(bottom_blobs, top_blobs, cmd, opt); + } +#endif + const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0]; const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1]; @@ -865,4 +904,749 @@ int Gemm_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c return ret; } +#if NCNN_INT8 +int Gemm_vulkan::create_pipeline_int8(const Option& opt) +{ + Option opt_int8 = opt; + opt_int8.use_fp16_arithmetic = false; + opt_int8.use_int16_packed = false; + opt_int8.use_int16_storage = false; + + coopmat_M = 0; + coopmat_N = 0; + coopmat_K = 0; + coopmat_subgroup_size = 0; + + use_cooperative_matrix = vkdev->info.support_int8_cooperative_matrix() && opt.use_cooperative_matrix && opt.use_int8_arithmetic; + if (use_cooperative_matrix) + { + int M = constantM ? 
constantM : 1024; + int N = constantN ? constantN : 1024; + int K = constantK ? constantK : 1024; + + vkdev->info.get_optimal_cooperative_matrix_mnk(M, N, K, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR, VK_SCOPE_SUBGROUP_KHR, coopmat_M, coopmat_N, coopmat_K, coopmat_subgroup_size); + + if (coopmat_M == 0 || coopmat_N == 0 || coopmat_K == 0) + { + use_cooperative_matrix = false; + } + else + { + UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2); + UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2); + UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2); + + UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2); + UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2); + } + } + + if (constantA) + { + A_data_int8_packed.create(constantK, constantM, (size_t)1u, 1); + if (A_data_int8_packed.empty()) + return -100; + + A_data_int8_descales.create(constantM, (size_t)4u, 1); + if (A_data_int8_descales.empty()) + return -100; + + if (A_data.elemsize == (size_t)1u) + { + for (int i = 0; i < constantM; i++) + { + const float scale = A_data_int8_scales[i]; + A_data_int8_descales[i] = scale == 0.f ? 0.f : 1.f / scale; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < constantM; i++) + { + signed char* outptr = A_data_int8_packed.row(i); + + for (int k = 0; k < constantK; k++) + { + outptr[k] = transA ? A_data.row(k)[i] : A_data.row(i)[k]; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < constantM; i++) + { + float absmax = 0.f; + for (int k = 0; k < constantK; k++) + { + const float v = transA ? A_data.row(k)[i] : A_data.row(i)[k]; + absmax = std::max(absmax, v < 0.f ? -v : v); + } + + const float A_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax; + A_data_int8_descales[i] = absmax == 0.f ? 
0.f : absmax * (1.f / 127.f); + + signed char* outptr = A_data_int8_packed.row(i); + + for (int k = 0; k < constantK; k++) + { + const float v = transA ? A_data.row(k)[i] : A_data.row(i)[k]; + outptr[k] = float2int8(v * A_int8_scale); + } + } + } + + if (use_cooperative_matrix) + { + Mat A_data_int8 = A_data_int8_packed; + + const int blocks_m = (constantM + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); + const int kk = (constantK + coopmat_K - 1) / coopmat_K; + + const int A_data_int8_packed_size = coopmat_M * coopmat_K * UNROLL_SG_M * UNROLL_WG_M * kk; + A_data_int8_packed.create(A_data_int8_packed_size / 4, blocks_m, (size_t)4u, 4); + if (A_data_int8_packed.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int bm = 0; bm < blocks_m; bm++) + { + signed char* p = A_data_int8_packed.row(bm); + + int k = 0; + for (; k + UNROLL_SG_K - 1 < kk; k += UNROLL_SG_K) + { + for (int wm = 0; wm < UNROLL_WG_M; wm++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zm = 0; zm < UNROLL_SG_M; zm++) + { + for (int i = 0; i < coopmat_M; i++) + { + for (int j = 0; j < coopmat_K; j++) + { + const int gmi = ((bm * UNROLL_WG_M + wm) * UNROLL_SG_M + zm) * coopmat_M + i; + const int gki = (k + zk) * coopmat_K + j; + + if (gmi < constantM && gki < constantK) + { + *p++ = A_data_int8.row(gmi)[gki]; + } + else + { + *p++ = 0; + } + } + } + } + } + } + } + for (; k < kk; k++) + { + for (int wm = 0; wm < UNROLL_WG_M; wm++) + { + for (int zm = 0; zm < UNROLL_SG_M; zm++) + { + for (int i = 0; i < coopmat_M; i++) + { + for (int j = 0; j < coopmat_K; j++) + { + const int gmi = ((bm * UNROLL_WG_M + wm) * UNROLL_SG_M + zm) * coopmat_M + i; + const int gki = k * coopmat_K + j; + + if (gmi < constantM && gki < constantK) + { + *p++ = A_data_int8.row(gmi)[gki]; + } + else + { + *p++ = 0; + } + } + } + } + } + } + } + } + } + + if (constantB) + { + B_data_int8_packed.create(constantK, constantN, (size_t)1u, 1); 
+ if (B_data_int8_packed.empty()) + return -100; + + B_data_int8_descales.create(1); + if (B_data_int8_descales.empty()) + return -100; + + if (B_data.elemsize == (size_t)1u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < constantN; j++) + { + signed char* outptr = B_data_int8_packed.row(j); + + for (int k = 0; k < constantK; k++) + { + outptr[k] = transB ? B_data.row(j)[k] : B_data.row(k)[j]; + } + } + + B_data_int8_descales[0] = B_data_int8_scale == 0.f ? 0.f : 1.f / B_data_int8_scale; + } + else + { + float absmax = 0.f; + for (int j = 0; j < constantN; j++) + { + for (int k = 0; k < constantK; k++) + { + const float v = transB ? B_data.row(j)[k] : B_data.row(k)[j]; + absmax = std::max(absmax, v < 0.f ? -v : v); + } + } + + const float B_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax; + B_data_int8_descales[0] = absmax == 0.f ? 0.f : absmax * (1.f / 127.f); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < constantN; j++) + { + signed char* outptr = B_data_int8_packed.row(j); + + for (int k = 0; k < constantK; k++) + { + const float v = transB ? 
B_data.row(j)[k] : B_data.row(k)[j]; + outptr[k] = float2int8(v * B_int8_scale); + } + } + } + + if (use_cooperative_matrix) + { + Mat B_data_int8 = B_data_int8_packed; + + const int blocks_n = (constantN + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + const int kk = (constantK + coopmat_K - 1) / coopmat_K; + + const int B_data_int8_packed_size = coopmat_N * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk; + B_data_int8_packed.create(B_data_int8_packed_size / 4, blocks_n, (size_t)4u, 4); + if (B_data_int8_packed.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int bn = 0; bn < blocks_n; bn++) + { + signed char* p = B_data_int8_packed.row(bn); + + int k = 0; + for (; k + UNROLL_SG_K - 1 < kk; k += UNROLL_SG_K) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zk = 0; zk < UNROLL_SG_K; zk++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_N; j++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j; + const int gki = (k + zk) * coopmat_K + i; + + if (gni < constantN && gki < constantK) + { + *p++ = B_data_int8.row(gni)[gki]; + } + else + { + *p++ = 0; + } + } + } + } + } + } + } + for (; k < kk; k++) + { + for (int wn = 0; wn < UNROLL_WG_N; wn++) + { + for (int zn = 0; zn < UNROLL_SG_N; zn++) + { + for (int i = 0; i < coopmat_K; i++) + { + for (int j = 0; j < coopmat_N; j++) + { + const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j; + const int gki = k * coopmat_K + i; + + if (gni < constantN && gki < constantK) + { + *p++ = B_data_int8.row(gni)[gki]; + } + else + { + *p++ = 0; + } + } + } + } + } + } + } + } + } + + if (constantC && constant_broadcast_type_C != -1) + { + C_data_packed = C_data; + } + + if (!constantA) + { + std::vector specializations(1); + specializations[0].i = transA; + + pipeline_gemm_quantize_A_int8 = new Pipeline(vkdev); + 
pipeline_gemm_quantize_A_int8->set_optimal_local_size_xyz(Mat(64, 1, 1, (void*)0)); + pipeline_gemm_quantize_A_int8->create(LayerShaderType::gemm_quantize_A_int8, opt_int8, specializations); + } + + if (!constantB) + { + std::vector specializations(1); + specializations[0].i = transB; + + pipeline_gemm_quantize_B_absmax_int8 = new Pipeline(vkdev); + pipeline_gemm_quantize_B_absmax_int8->set_local_size_xyz(128, 1, 1); + pipeline_gemm_quantize_B_absmax_int8->create(LayerShaderType::gemm_quantize_B_absmax_int8, opt_int8, specializations); + + pipeline_gemm_quantize_B_descale_int8 = new Pipeline(vkdev); + pipeline_gemm_quantize_B_descale_int8->set_local_size_xyz(128, 1, 1); + pipeline_gemm_quantize_B_descale_int8->create(LayerShaderType::gemm_quantize_B_descale_int8, opt_int8, std::vector()); + + pipeline_gemm_quantize_B_int8 = new Pipeline(vkdev); + pipeline_gemm_quantize_B_int8->set_optimal_local_size_xyz(Mat(64, 1, 1, (void*)0)); + pipeline_gemm_quantize_B_int8->create(LayerShaderType::gemm_quantize_B_int8, opt_int8, specializations); + } + + if (use_cooperative_matrix) + { + int outh = output_transpose ? constantN : constantM; + int out_elempack = outh ? (outh % 4 == 0 ? 
4 : 1) : 0; + if (output_elempack) + out_elempack = output_elempack; + + std::vector specializations(11 + 9); + specializations[0].f = alpha; + specializations[1].f = beta; + specializations[2].i = constantA; + specializations[3].i = constantB; + specializations[4].i = constantC; + specializations[5].i = constant_broadcast_type_C; + specializations[6].i = output_transpose; + specializations[7].u32 = constantM; + specializations[8].u32 = constantN; + specializations[9].u32 = constantK; + specializations[10].u32 = out_elempack; + + specializations[11 + 0].u32 = coopmat_M; + specializations[11 + 1].u32 = coopmat_N; + specializations[11 + 2].u32 = coopmat_K; + specializations[11 + 3].u32 = coopmat_subgroup_size; + specializations[11 + 4].u32 = UNROLL_SG_M; + specializations[11 + 5].u32 = UNROLL_SG_N; + specializations[11 + 6].u32 = UNROLL_SG_K; + specializations[11 + 7].u32 = UNROLL_WG_M; + specializations[11 + 8].u32 = UNROLL_WG_N; + + pipeline_gemm = new Pipeline(vkdev); + pipeline_gemm->set_subgroup_size(coopmat_subgroup_size); + pipeline_gemm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); + pipeline_gemm->create(LayerShaderType::gemm_int8_cm, opt_int8, specializations); + } + else + { + std::vector specializations(5); + specializations[0].f = alpha; + specializations[1].f = beta; + specializations[2].i = constantC; + specializations[3].i = constant_broadcast_type_C; + specializations[4].i = output_transpose; + + pipeline_gemm = new Pipeline(vkdev); + pipeline_gemm->set_local_size_xyz(8, 8, 1); + pipeline_gemm->create(LayerShaderType::gemm_int8, opt_int8, specializations); + } + + if (opt.lightmode) + { + A_data.release(); + B_data.release(); + C_data.release(); + } + + return 0; +} + +int Gemm_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt) +{ + Option opt_fp32 = opt; + opt_fp32.use_fp16_packed = false; + opt_fp32.use_fp16_storage = false; + opt_fp32.use_bf16_packed = false; + opt_fp32.use_bf16_storage = false; + + if 
(constantA) + { + cmd.record_upload(A_data_int8_packed, A_data_gpu, opt); + + A_data_int8_packed.release(); + + cmd.record_upload(A_data_int8_descales, A_data_int8_descales_gpu, opt_fp32); + + A_data_int8_descales.release(); + A_data_int8_scales.release(); + } + + if (constantB) + { + cmd.record_upload(B_data_int8_packed, B_data_gpu, opt); + + B_data_int8_packed.release(); + + cmd.record_upload(B_data_int8_descales, B_data_int8_descales_gpu, opt_fp32); + + B_data_int8_descales.release(); + } + + if (constantC && constant_broadcast_type_C != -1) + { + cmd.record_upload(C_data_packed, C_data_gpu, opt); + + C_data_packed.release(); + } + + return 0; +} + +int Gemm_vulkan::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0]; + const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1]; + + VkMat A = A0; + VkMat B = B0; + + // Runtime int8 blobs do not carry scale metadata, so reject before recording int8 pipelines. + if (!constantA && A.elembits() == 8) + { + NCNN_LOGE("Gemm_vulkan int8 dynamic int8 A is not supported without input scale"); + return -1; + } + + if (!constantB && B.elembits() == 8) + { + NCNN_LOGE("Gemm_vulkan int8 dynamic int8 B is not supported without input scale"); + return -1; + } + + if (!constantA && A.elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + VkMat A_unpacked; + vkdev->convert_packing(A, A_unpacked, 1, cmd, opt_pack1); + A = A_unpacked; + } + + if (!constantB && B.elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + VkMat B_unpacked; + vkdev->convert_packing(B, B_unpacked, 1, cmd, opt_pack1); + B = B_unpacked; + } + + const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h); + const int K = constantK ? constantK : transA ? (A.dims == 3 ? 
A.c : A.h) : A.w; + const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w; + + VkMat C; + int broadcast_type_C = -1; + if (constantC && constant_broadcast_type_C != -1) + { + C = C_data_gpu; + broadcast_type_C = constant_broadcast_type_C; + } + else + { + VkMat C0; + if (constantA && constantB) + { + C0 = bottom_blobs.size() == 1 ? bottom_blobs[0] : VkMat(); + } + else if (constantA) + { + C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat(); + } + else if (constantB) + { + C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat(); + } + else + { + C0 = bottom_blobs.size() == 3 ? bottom_blobs[2] : VkMat(); + } + + if (!C0.empty()) + { + C = C0; + if (C.elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + VkMat C_unpacked; + vkdev->convert_packing(C, C_unpacked, 1, cmd, opt_pack1); + C = C_unpacked; + } + + if (C.dims == 1 && C.w == 1) + { + broadcast_type_C = 0; + } + if (C.dims == 1 && C.w == M) + { + broadcast_type_C = 1; + } + if (C.dims == 1 && C.w == N) + { + broadcast_type_C = 4; + } + if (C.dims == 2 && C.w == 1 && C.h == M) + { + broadcast_type_C = 2; + } + if (C.dims == 2 && C.w == N && C.h == M) + { + broadcast_type_C = 3; + } + if (C.dims == 2 && C.w == N && C.h == 1) + { + broadcast_type_C = 4; + } + } + } + + if (!C.empty() && C.elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + VkMat C_unpacked; + vkdev->convert_packing(C, C_unpacked, 1, cmd, opt_pack1); + C = C_unpacked; + } + + int out_elempack = 1; + if (use_cooperative_matrix) + { + int outh = output_transpose ? N : M; + out_elempack = outh % 4 == 0 ? 
4 : 1; + if (output_elempack) + out_elempack = output_elempack; + } + + size_t elemsize; + if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + elemsize = 2u; + } + else + { + elemsize = 4u; + } + size_t out_elemsize = elemsize * out_elempack; + + VkMat A_int8 = A; + VkMat A_int8_descales = A_data_int8_descales_gpu; + if (!constantA) + { + A_int8.create(K, M, (size_t)1u, 1, opt.workspace_vkallocator); + if (A_int8.empty()) + return -100; + + A_int8_descales.create(M, (size_t)4u, 1, opt.workspace_vkallocator); + if (A_int8_descales.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = A; + bindings[1] = A_int8; + bindings[2] = A_int8_descales; + + std::vector constants(4); + constants[0].i = M; + constants[1].i = K; + constants[2].i = A.dims; + constants[3].i = A.dims == 3 ? A.cstep : A.dims == 2 ? A.w : transA ? M : K; + + VkMat dispatcher; + dispatcher.w = M; + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_gemm_quantize_A_int8, bindings, constants, dispatcher); + } + + VkMat B_int8 = B; + VkMat B_int8_descale = B_data_int8_descales_gpu; + if (!constantB) + { + const int size = N * K; + const int blocks = (size + 1023) / 1024; + + B_int8.create(K, N, (size_t)1u, 1, opt.workspace_vkallocator); + if (B_int8.empty()) + return -100; + + B_int8_descale.create(1, (size_t)4u, 1, opt.workspace_vkallocator); + if (B_int8_descale.empty()) + return -100; + + VkMat B_absmax; + B_absmax.create(blocks, (size_t)4u, 1, opt.workspace_vkallocator); + if (B_absmax.empty()) + return -100; + + { + std::vector bindings(2); + bindings[0] = B; + bindings[1] = B_absmax; + + std::vector constants(5); + constants[0].i = N; + constants[1].i = K; + constants[2].i = B.dims; + constants[3].i = B.dims == 3 ? B.cstep : B.dims == 2 ? B.w : transB ? 
K : N; + constants[4].i = size; + + VkMat dispatcher; + dispatcher.w = blocks * 128; + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_gemm_quantize_B_absmax_int8, bindings, constants, dispatcher); + } + + { + std::vector bindings(2); + bindings[0] = B_absmax; + bindings[1] = B_int8_descale; + + std::vector constants(1); + constants[0].i = blocks; + + VkMat dispatcher; + dispatcher.w = 1; + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_gemm_quantize_B_descale_int8, bindings, constants, dispatcher); + } + + std::vector bindings(4); + bindings[0] = B; + bindings[1] = B_int8; + bindings[2] = B_int8_descale; + bindings[3] = B_int8; + + std::vector constants(5); + constants[0].i = N; + constants[1].i = K; + constants[2].i = B.dims; + constants[3].i = B.dims == 3 ? B.cstep : B.dims == 2 ? B.w : transB ? K : N; + constants[4].i = size; + + VkMat dispatcher; + dispatcher.w = (size + 3) / 4; + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_gemm_quantize_B_int8, bindings, constants, dispatcher); + } + + VkMat& top_blob = top_blobs[0]; + if (output_transpose) + { + if (output_N1M) + top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else + { + if (output_N1M) + top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + else + top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (top_blob.empty()) + return -100; + + std::vector bindings(use_cooperative_matrix ? 7 : 6); + bindings[0] = top_blob; + bindings[1] = A_int8; + bindings[2] = B_int8; + bindings[3] = C; + bindings[4] = A_int8_descales; + bindings[5] = B_int8_descale; + if (use_cooperative_matrix) + { + bindings[6] = top_blob; + } + + std::vector constants(use_cooperative_matrix ? 
6 : 5); + constants[0].u32 = M; + constants[1].u32 = N; + constants[2].u32 = K; + constants[3].i = broadcast_type_C; + constants[4].u32 = top_blob.dims == 3 ? top_blob.cstep : top_blob.w; + if (use_cooperative_matrix) + { + constants[5].u32 = out_elempack; + } + + VkMat dispatcher; + if (use_cooperative_matrix) + { + const int blocks_x = (M + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); + const int blocks_y = (N + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); + + dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N); + dispatcher.h = 1; + dispatcher.c = 1; + } + else + { + dispatcher.w = (N + 3) / 4; + dispatcher.h = (M + 3) / 4; + dispatcher.c = 1; + } + + cmd.record_pipeline(pipeline_gemm, bindings, constants, dispatcher); + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h index 9b275ca240e1..c483f543ab98 100644 --- a/src/layer/vulkan/gemm_vulkan.h +++ b/src/layer/vulkan/gemm_vulkan.h @@ -13,8 +13,6 @@ class Gemm_vulkan : public Gemm public: Gemm_vulkan(); - virtual int load_param(const ParamDict& pd); - virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -24,6 +22,13 @@ class Gemm_vulkan : public Gemm virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int upload_model_int8(VkTransfer& cmd, const Option& opt); + int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; +#endif + public: Mat A_data_packed; Mat B_data_packed; @@ -49,6 +54,21 @@ class Gemm_vulkan : public Gemm int UNROLL_SG_K; int UNROLL_WG_M; int 
UNROLL_WG_N; + +#if NCNN_INT8 + Mat A_data_int8_packed; + Mat B_data_int8_packed; + Mat A_data_int8_descales; + Mat B_data_int8_descales; + + VkMat A_data_int8_descales_gpu; + VkMat B_data_int8_descales_gpu; + + Pipeline* pipeline_gemm_quantize_A_int8; + Pipeline* pipeline_gemm_quantize_B_absmax_int8; + Pipeline* pipeline_gemm_quantize_B_descale_int8; + Pipeline* pipeline_gemm_quantize_B_int8; +#endif }; } // namespace ncnn diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index f146ea2c5e1f..13a3f04496ff 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -5,6 +5,7 @@ #include "layer_shader_type.h" #include "layer_type.h" +#include "modelbin.h" namespace ncnn { @@ -16,15 +17,29 @@ InnerProduct_vulkan::InnerProduct_vulkan() flatten = 0; pipeline_innerproduct = 0; - pipeline_innerproduct_sum8 = 0; pipeline_innerproduct_reduce_sum8 = 0; - pipeline_innerproduct_gemm = 0; + +#if NCNN_INT8 + quantize = 0; + + pipeline_innerproduct_int8 = 0; + pipeline_innerproduct_sum8_int8 = 0; + pipeline_innerproduct_reduce_sum8_int8 = 0; + pipeline_innerproduct_gemm_int8 = 0; +#endif } int InnerProduct_vulkan::create_pipeline(const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8(opt); + } +#endif + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; @@ -279,7 +294,6 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt) delete flatten; flatten = 0; } - delete pipeline_innerproduct; pipeline_innerproduct = 0; @@ -291,11 +305,38 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt) delete pipeline_innerproduct_gemm; pipeline_innerproduct_gemm = 0; +#if NCNN_INT8 + if (quantize) + { + quantize->destroy_pipeline(opt); + delete quantize; + quantize = 0; + } + + delete pipeline_innerproduct_int8; + pipeline_innerproduct_int8 = 0; + + delete pipeline_innerproduct_sum8_int8; + delete pipeline_innerproduct_reduce_sum8_int8; + pipeline_innerproduct_sum8_int8 = 0; + pipeline_innerproduct_reduce_sum8_int8 = 0; + + delete pipeline_innerproduct_gemm_int8; + pipeline_innerproduct_gemm_int8 = 0; +#endif + return 0; } int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { +#if NCNN_INT8 + if (int8_scale_term && weight_data_int8_packed.elembits() == 8) + { + return upload_model_int8(cmd, opt); + } +#endif + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); weight_data_packed.release(); @@ -312,6 +353,13 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { +#if NCNN_INT8 + if (int8_scale_term) + { + return forward_int8(bottom_blob, top_blob, cmd, opt); + } +#endif + const int num_input = weight_data_size / num_output; int in_elempack = num_input % 4 == 0 ? 4 : 1; @@ -463,4 +511,506 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo return 0; } +#if NCNN_INT8 +int InnerProduct_vulkan::create_pipeline_int8(const Option& opt) +{ + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; + + const int num_input = weight_data_size / num_output; + + const int num_input_packed = (num_input + 7) / 8 * 8; + const int num_output_packed = (num_output + 3) / 4 * 4; + + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_int8_packed.create(num_input_packed / 4, num_output_packed / 4, (size_t)16u, 16); + weight_data_int8_packed.fill(0); + + for (int q = 0; q < num_output_packed; q += 4) + { + signed char* g00 = weight_data_int8_packed.row(q / 4); + + for (int p = 0; p < num_input_packed; p += 4) + { + for (int i = 0; i < 4; i++) + { + const signed char* k0 = q + i < num_output && p < num_input ? weight_data_r2.row(q + i) + p : 0; + + for (int j = 0; j < 4; j++) + { + g00[0] = k0 && p + j < num_input ? k0[j] : 0; + g00++; + } + } + } + } + } + + { + const float bottom_blob_int8_scale = bottom_blob_int8_scales.empty() ? 1.f : bottom_blob_int8_scales[0]; + const float bottom_blob_int8_descale = bottom_blob_int8_scale == 0.f ? 0.f : 1.f / bottom_blob_int8_scale; + + weight_data_int8_descales.create(num_output_packed / 4, (size_t)4u * 4, 4); + if (weight_data_int8_descales.empty()) + return -100; + weight_data_int8_descales.fill(0.f); + + float* outptr = weight_data_int8_descales; + for (int q = 0; q < num_output; q++) + { + float scale = weight_data_int8_scales[q]; + outptr[q] = scale == 0.f ? 
0.f : bottom_blob_int8_descale / scale; + } + } + + if (bias_term) + { + bias_data_int8_packed.create(num_output_packed / 4, (size_t)4u * 4, 4); + if (bias_data_int8_packed.empty()) + return -100; + bias_data_int8_packed.fill(0.f); + + float* outptr = bias_data_int8_packed; + for (int q = 0; q < num_output; q++) + { + outptr[q] = bias_data[q]; + } + } + + Option opt_int8 = opt; + opt_int8.use_fp16_arithmetic = false; + opt_int8.use_int16_packed = false; + opt_int8.use_int16_storage = false; + + { + quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize); + quantize->vkdev = vkdev; + + Mat shape_quantize; + Mat out_shape_quantize; + if (shape.dims == 2 && shape.w == num_input) + { + const size_t elemsize = shape.elempack == 0 ? (size_t)4u : shape.elemsize / shape.elempack; + shape_quantize = Mat(shape.w, shape.h, (void*)0, elemsize * shape.elempack, shape.elempack); + out_shape_quantize = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack); + } + else if (shape.dims != 0) + { + const int total = shape.w * shape.h * shape.d * shape.c * shape.elempack; + const int flatten_elempack = total % 4 == 0 ? 4 : 1; + const size_t elemsize = shape.elempack == 0 ? 
(size_t)4u : shape.elemsize / shape.elempack; + shape_quantize = Mat(total / flatten_elempack, (void*)0, elemsize * flatten_elempack, flatten_elempack); + out_shape_quantize = Mat(total / flatten_elempack, (void*)0, (size_t)flatten_elempack, flatten_elempack); + } + + quantize->bottom_shapes.resize(1); + quantize->bottom_shapes[0] = shape_quantize; + quantize->top_shapes.resize(1); + quantize->top_shapes[0] = out_shape_quantize; + + ncnn::ParamDict pd; + pd.set(0, 1); + quantize->load_param(pd); + + Mat weights[1]; + weights[0] = bottom_blob_int8_scales; + quantize->load_model(ModelBinFromMatArray(weights)); + + Option opt_quantize = opt; + opt_quantize.use_fp16_arithmetic = false; + + quantize->create_pipeline(opt_quantize); + } + + if (shape.dims == 2 && shape.w == num_input) + { + // gemm + Mat shape_unpacked(num_input, shape.h * shape.elempack, (void*)0); + Mat out_shape_unpacked(num_output, out_shape.dims == 0 ? 0 : out_shape.h * out_shape.elempack, (void*)0); + + std::vector specializations(6 + 10); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; + specializations[4].i = shape.elempack; + specializations[5].i = num_input_packed / 4; + specializations[6 + 0].i = shape_unpacked.dims; + specializations[6 + 1].i = shape_unpacked.w; + specializations[6 + 2].i = shape_unpacked.h; + specializations[6 + 3].i = shape.elempack; + specializations[6 + 4].i = shape.w; + specializations[6 + 5].i = out_shape_unpacked.dims; + specializations[6 + 6].i = out_shape_unpacked.w; + specializations[6 + 7].i = out_shape_unpacked.h; + specializations[6 + 8].i = out_shape_unpacked.c; + specializations[6 + 9].i = out_shape.w; + + Mat local_size_xyz(std::min(16, (num_output + 3) / 4), 4, 1, (void*)0); + if (out_shape_unpacked.dims != 0) + { + local_size_xyz.w = std::min(16, (out_shape_unpacked.w + 3) / 4); + local_size_xyz.h = std::min(4, out_shape_unpacked.h); + local_size_xyz.c = 1; + } + + pipeline_innerproduct_gemm_int8 = new Pipeline(vkdev); + if (opt.use_shader_local_memory) + { + pipeline_innerproduct_gemm_int8->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_innerproduct_gemm_int8->set_optimal_local_size_xyz(local_size_xyz); + } + pipeline_innerproduct_gemm_int8->create(LayerShaderType::innerproduct_gemm_int8, opt_int8, specializations); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; + } + + size_t elemsize; + if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + elemsize = 2u; + } + else + { + elemsize = 4u; + } + + Mat shape_flatten; + if (shape.dims != 0) + { + const int total = shape.w * shape.h * shape.d * shape.c * shape.elempack; + const int flatten_elempack = total % 4 == 0 ? 
4 : 1; + shape_flatten = Mat(total / flatten_elempack, (void*)0, elemsize * flatten_elempack, flatten_elempack); + } + + { + flatten = ncnn::create_layer_vulkan(ncnn::LayerType::Flatten); + flatten->vkdev = vkdev; + + flatten->bottom_shapes.resize(1); + flatten->bottom_shapes[0] = shape; + flatten->top_shapes.resize(1); + flatten->top_shapes[0] = shape_flatten; + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + + if (num_input_packed / 4 >= 32) + { + const int outw_sum8 = (num_input_packed / 4 + 7) / 8; + const int outh_sum8 = num_output_packed / 4; + + // sum8 + { + std::vector specializations(1 + 3); + specializations[0].i = num_input_packed / 4; + specializations[1 + 0].i = shape_flatten.w * shape_flatten.elempack; + specializations[1 + 1].i = outw_sum8; + specializations[1 + 2].i = outh_sum8; + + pipeline_innerproduct_sum8_int8 = new Pipeline(vkdev); + pipeline_innerproduct_sum8_int8->set_local_size_xyz(8, std::min(8, outh_sum8), 1); + pipeline_innerproduct_sum8_int8->create(LayerShaderType::innerproduct_sum8_int8, opt_int8, specializations); + } + + // reduce sum8 + { + std::vector specializations(4 + 3); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; + specializations[4 + 0].i = outw_sum8; + specializations[4 + 1].i = outh_sum8; + specializations[4 + 2].i = (num_output + 3) / 4; + + pipeline_innerproduct_reduce_sum8_int8 = new Pipeline(vkdev); + pipeline_innerproduct_reduce_sum8_int8->set_local_size_xyz(std::min(64, (num_output + 3) / 4), 1, 1); + pipeline_innerproduct_reduce_sum8_int8->create(LayerShaderType::innerproduct_reduce_sum8_int8, opt_int8, specializations); + } + } + else + { + std::vector specializations(5 + 2); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[4].i = num_input_packed / 4; + specializations[5 + 0].i = shape_flatten.w * shape_flatten.elempack; + specializations[5 + 1].i = num_output; + + Mat local_size_xyz(std::min(64, (num_output + 3) / 4), 1, 1, (void*)0); + if (out_shape.dims != 0) + { + local_size_xyz.w = std::min(64, (num_output + 3) / 4); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + pipeline_innerproduct_int8 = new Pipeline(vkdev); + pipeline_innerproduct_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_int8->create(LayerShaderType::innerproduct_int8, opt_int8, specializations); + } + + // gemm for no shape hint + if (shape.dims == 0) + { + std::vector specializations(6 + 10); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; + specializations[4].i = 0; + specializations[5].i = num_input_packed / 4; + specializations[6 + 0].i = 0; + specializations[6 + 1].i = 0; + specializations[6 + 2].i = 0; + specializations[6 + 3].i = 0; + specializations[6 + 4].i = 0; + specializations[6 + 5].i = 0; + specializations[6 + 6].i = 0; + specializations[6 + 7].i = 0; + specializations[6 + 8].i = 0; + specializations[6 + 9].i = 0; + + Mat local_size_xyz(std::min(16, (num_output + 3) / 4), 4, 1, (void*)0); + + pipeline_innerproduct_gemm_int8 = new Pipeline(vkdev); + if (opt.use_shader_local_memory) + { + pipeline_innerproduct_gemm_int8->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_innerproduct_gemm_int8->set_optimal_local_size_xyz(local_size_xyz); + } + pipeline_innerproduct_gemm_int8->create(LayerShaderType::innerproduct_gemm_int8, opt_int8, specializations); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt) +{ + cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt); + + weight_data_int8_packed.release(); + + cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt); + + weight_data_int8_descales.release(); + weight_data_int8_scales.release(); + + if (bias_term) + { + cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt); + + bias_data_int8_packed.release(); + bias_data.release(); + } + + quantize->upload_model(cmd, opt); + + return 0; +} + +int InnerProduct_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + const int out_elempack = num_output % 4 == 0 ? 
4 : 1; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input) + { + // gemm + VkMat bottom_blob_quantized = bottom_blob; + + if (bottom_blob_quantized.elembits() != 8) + { + Option opt_quantize = opt; + opt_quantize.blob_vkallocator = opt.workspace_vkallocator; + opt_quantize.use_fp16_arithmetic = false; + + VkMat bottom_blob_int8; + int ret = quantize->forward(bottom_blob_quantized, bottom_blob_int8, cmd, opt_quantize); + if (ret != 0) + return ret; + + bottom_blob_quantized = bottom_blob_int8; + } + + const int h = bottom_blob_quantized.h; + const int elempack = bottom_blob_quantized.elempack; + const int outh = h * elempack; + size_t out_elemsize; + if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + out_elemsize = elempack * 2u; + } + else + { + out_elemsize = elempack * 4u; + } + + top_blob.create(num_output, h, out_elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(7); + bindings[0] = bottom_blob_quantized; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = weight_data_int8_descales_gpu; + bindings[4] = bias_data_gpu; + bindings[5] = top_blob; + bindings[6] = bottom_blob_quantized; + + std::vector constants(10); + constants[0].i = bottom_blob_quantized.dims; + constants[1].i = num_input; + constants[2].i = outh; + constants[3].i = elempack; + constants[4].i = bottom_blob_quantized.w; + constants[5].i = top_blob.dims; + constants[6].i = num_output; + constants[7].i = outh; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (num_output + 3) / 4; + dispatcher.h = (outh + 3) / 4; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_innerproduct_gemm_int8, bindings, constants, dispatcher); + + return 0; + } + + // flatten + VkMat bottom_blob_flattened = bottom_blob; + { + Option opt_flatten = opt; + opt_flatten.blob_vkallocator = opt.workspace_vkallocator; + + 
flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten); + } + + if (bottom_blob_flattened.elembits() != 8) + { + Option opt_quantize = opt; + opt_quantize.blob_vkallocator = opt.workspace_vkallocator; + opt_quantize.use_fp16_arithmetic = false; + + VkMat bottom_blob_int8; + int ret = quantize->forward(bottom_blob_flattened, bottom_blob_int8, cmd, opt_quantize); + if (ret != 0) + return ret; + + bottom_blob_flattened = bottom_blob_int8; + } + + size_t out_elemsize; + if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed) + { + out_elemsize = out_elempack * 2u; + } + else + { + out_elemsize = out_elempack * 4u; + } + + const int num_input_packed = (num_input + 7) / 8 * 8; + const int num_output_packed = (num_output + 3) / 4 * 4; + + if (num_input_packed / 4 >= 32) + { + // sum8 + VkMat top_blob_sum8; + { + top_blob_sum8.create((num_input_packed / 4 + 7) / 8, num_output_packed / 4, (size_t)4u * 4, 4, opt.blob_vkallocator); + if (top_blob_sum8.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob_flattened; + bindings[1] = top_blob_sum8; + bindings[2] = weight_data_gpu; + + std::vector constants(3); + constants[0].i = bottom_blob_flattened.w * bottom_blob_flattened.elempack; + constants[1].i = top_blob_sum8.w; + constants[2].i = top_blob_sum8.h; + + cmd.record_pipeline(pipeline_innerproduct_sum8_int8, bindings, constants, top_blob_sum8); + } + + // reduce sum8 + { + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = top_blob_sum8; + bindings[1] = top_blob; + bindings[2] = weight_data_int8_descales_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(3); + constants[0].i = top_blob_sum8.w; + constants[1].i = top_blob_sum8.h; + constants[2].i = (num_output + 3) / 4; + + cmd.record_pipeline(pipeline_innerproduct_reduce_sum8_int8, bindings, constants, 
top_blob); + } + + return 0; + } + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(5); + bindings[0] = bottom_blob_flattened; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = weight_data_int8_descales_gpu; + bindings[4] = bias_data_gpu; + + std::vector constants(2); + constants[0].i = bottom_blob_flattened.w * bottom_blob_flattened.elempack; + constants[1].i = num_output; + + VkMat dispatcher; + dispatcher.w = (num_output + 3) / 4; + dispatcher.h = 1; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_innerproduct_int8, bindings, constants, dispatcher); + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h index 8495b2296167..63f6fe4beeb1 100644 --- a/src/layer/vulkan/innerproduct_vulkan.h +++ b/src/layer/vulkan/innerproduct_vulkan.h @@ -21,6 +21,13 @@ class InnerProduct_vulkan : public InnerProduct using InnerProduct::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int upload_model_int8(VkTransfer& cmd, const Option& opt); + int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; +#endif + public: ncnn::Layer* flatten; @@ -35,6 +42,21 @@ class InnerProduct_vulkan : public InnerProduct Pipeline* pipeline_innerproduct_reduce_sum8; Pipeline* pipeline_innerproduct_gemm; + +#if NCNN_INT8 + ncnn::Layer* quantize; + + Mat weight_data_int8_packed; + Mat weight_data_int8_descales; + Mat bias_data_int8_packed; + + VkMat weight_data_int8_descales_gpu; + + Pipeline* pipeline_innerproduct_int8; + Pipeline* pipeline_innerproduct_sum8_int8; + Pipeline* pipeline_innerproduct_reduce_sum8_int8; + Pipeline* pipeline_innerproduct_gemm_int8; +#endif }; } // 
namespace ncnn diff --git a/src/layer/vulkan/padding_vulkan.cpp b/src/layer/vulkan/padding_vulkan.cpp index 7e2120ca59f4..ea1242d04b93 100644 --- a/src/layer/vulkan/padding_vulkan.cpp +++ b/src/layer/vulkan/padding_vulkan.cpp @@ -19,6 +19,16 @@ Padding_vulkan::Padding_vulkan() pipeline_padding_3d = 0; pipeline_padding_3d_pack4 = 0; + +#if NCNN_INT8 + pipeline_padding_int8 = 0; + pipeline_padding_pack4_int8 = 0; + pipeline_padding_pack1to4_int8 = 0; + pipeline_padding_pack4to1_int8 = 0; + + pipeline_padding_3d_int8 = 0; + pipeline_padding_3d_pack4_int8 = 0; +#endif // NCNN_INT8 } int Padding_vulkan::create_pipeline(const Option& opt) @@ -26,6 +36,25 @@ int Padding_vulkan::create_pipeline(const Option& opt) const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; +#if NCNN_INT8 + if (per_channel_pad_data_size) + { + per_channel_pad_data_int8.create((per_channel_pad_data_size + 3) / 4 * 4, (size_t)1u, 1); + if (per_channel_pad_data_int8.empty()) + return -100; + + signed char* outptr = per_channel_pad_data_int8; + for (int i = 0; i < per_channel_pad_data_int8.w; i++) + { + outptr[i] = 0; + } + for (int i = 0; i < per_channel_pad_data_size; i++) + { + outptr[i] = static_cast((float)per_channel_pad_data[i]); + } + } +#endif // NCNN_INT8 + int offset_elempack = 1; if (shape.dims == 1) { @@ -125,24 +154,30 @@ int Padding_vulkan::create_pipeline(const Option& opt) } // pack1 - if (out_shape.dims == 0 || (offset_elempack == 1 && out_shape.elempack == 1)) + if (out_shape.dims == 0 || (out_shape.dims != 4 && offset_elempack == 1 && out_shape.elempack == 1)) { pipeline_padding = new Pipeline(vkdev); pipeline_padding->set_optimal_local_size_xyz(local_size_xyz); pipeline_padding->create(LayerShaderType::padding, opt, specializations); + } + if (out_shape.dims == 0 || (out_shape.dims == 4 && offset_elempack == 1 && out_shape.elempack == 1)) + { pipeline_padding_3d = new Pipeline(vkdev); 
pipeline_padding_3d->set_optimal_local_size_xyz(local_size_xyz); pipeline_padding_3d->create(LayerShaderType::padding_3d, opt, specializations_3d); } // pack4 - if (out_shape.dims == 0 || (offset_elempack == 4 && out_shape.elempack == 4)) + if (out_shape.dims == 0 || (out_shape.dims != 4 && offset_elempack == 4 && out_shape.elempack == 4)) { pipeline_padding_pack4 = new Pipeline(vkdev); pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz); pipeline_padding_pack4->create(LayerShaderType::padding_pack4, opt, specializations); + } + if (out_shape.dims == 0 || (out_shape.dims == 4 && offset_elempack == 4 && out_shape.elempack == 4)) + { pipeline_padding_3d_pack4 = new Pipeline(vkdev); pipeline_padding_3d_pack4->set_optimal_local_size_xyz(local_size_xyz); pipeline_padding_3d_pack4->create(LayerShaderType::padding_3d_pack4, opt, specializations_3d); @@ -164,6 +199,13 @@ int Padding_vulkan::create_pipeline(const Option& opt) pipeline_padding_pack4to1->create(LayerShaderType::padding_pack4to1, opt, specializations); } +#if NCNN_INT8 + if (opt.use_int8_packed || opt.use_int8_storage) + { + return create_pipeline_int8(opt); + } +#endif // NCNN_INT8 + return 0; } @@ -187,6 +229,26 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) delete pipeline_padding_3d_pack4; pipeline_padding_3d_pack4 = 0; +#if NCNN_INT8 + delete pipeline_padding_int8; + pipeline_padding_int8 = 0; + + delete pipeline_padding_pack4_int8; + pipeline_padding_pack4_int8 = 0; + + delete pipeline_padding_pack1to4_int8; + pipeline_padding_pack1to4_int8 = 0; + + delete pipeline_padding_pack4to1_int8; + pipeline_padding_pack4to1_int8 = 0; + + delete pipeline_padding_3d_int8; + pipeline_padding_3d_int8 = 0; + + delete pipeline_padding_3d_pack4_int8; + pipeline_padding_3d_pack4_int8 = 0; +#endif // NCNN_INT8 + return 0; } @@ -197,8 +259,15 @@ int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt) cmd.record_upload(per_channel_pad_data, per_channel_pad_data_gpu, opt); +#if 
NCNN_INT8 + cmd.record_upload(per_channel_pad_data_int8, per_channel_pad_data_int8_gpu, opt); +#endif // NCNN_INT8 + if (opt.lightmode) { +#if NCNN_INT8 + per_channel_pad_data_int8.release(); +#endif // NCNN_INT8 per_channel_pad_data.release(); } @@ -207,6 +276,11 @@ int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { +#if NCNN_INT8 + if (bottom_blob.elembits() == 8) + return forward_int8(bottom_blob, top_blob, cmd, opt); +#endif // NCNN_INT8 + int dims = bottom_blob.dims; int w = bottom_blob.w; int h = bottom_blob.h; @@ -383,6 +457,12 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int Padding_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const { const VkMat& bottom_blob = bottom_blobs[0]; + +#if NCNN_INT8 + if (bottom_blob.elembits() == 8) + return forward_int8(bottom_blobs, top_blobs, cmd, opt); +#endif // NCNN_INT8 + const VkMat& reference_blob = bottom_blobs[1]; VkMat& top_blob = top_blobs[0]; @@ -576,4 +656,541 @@ int Padding_vulkan::forward(const std::vector& bottom_blobs, std::vector< return 0; } +#if NCNN_INT8 +int Padding_vulkan::create_pipeline_int8(const Option& opt) +{ + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; + + Mat shape_int8; + if (shape.dims == 1) shape_int8 = Mat(shape.w, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 2) shape_int8 = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 3) shape_int8 = Mat(shape.w, shape.h, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack); + if (shape.dims == 4) shape_int8 = Mat(shape.w, shape.h, shape.d, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack); + + Mat out_shape_int8; + if (out_shape.dims == 1) out_shape_int8 = Mat(out_shape.w, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 2) out_shape_int8 = Mat(out_shape.w, out_shape.h, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 3) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + if (out_shape.dims == 4) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack); + + int offset_elempack = 1; + if (shape_int8.dims == 1) + { + if (left == 0) + offset_elempack = shape_int8.elempack; + else + offset_elempack = left % 4 == 0 ? 4 : 1; + } + else if (shape_int8.dims == 2) + { + if (top == 0) + offset_elempack = shape_int8.elempack; + else + offset_elempack = top % 4 == 0 ? 4 : 1; + } + else if (shape_int8.dims == 3) + { + if (front == 0) + offset_elempack = shape_int8.elempack; + else + offset_elempack = front % 4 == 0 ? 
4 : 1; + } + else // if (shape_int8.dims == 4) + { + offset_elempack = shape_int8.elempack; + } + + offset_elempack = std::min(offset_elempack, shape_int8.elempack); + + Mat shape_unpacked = shape_int8; + if (one_blob_only && shape_int8.dims != 0 && shape_int8.elempack > offset_elempack) + { + size_t offset_elemsize = shape_int8.elemsize / shape_int8.elempack * offset_elempack; + + if (shape_int8.dims == 1) shape_unpacked = Mat(shape_int8.w * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape_int8.dims == 2) shape_unpacked = Mat(shape_int8.w, shape_int8.h * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape_int8.dims == 3) shape_unpacked = Mat(shape_int8.w, shape_int8.h, shape_int8.c * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + // if (shape_int8.dims == 4) should never reach here + } + + std::vector specializations(3 + 10); + specializations[0].i = type; + specializations[1].f = value; + specializations[2].i = per_channel_pad_data_size ? 1 : 0; + specializations[3 + 0].i = shape_unpacked.dims; + specializations[3 + 1].i = shape_unpacked.w; + specializations[3 + 2].i = shape_unpacked.h; + specializations[3 + 3].i = shape_unpacked.c; + specializations[3 + 4].i = shape_unpacked.cstep; + specializations[3 + 5].i = out_shape_int8.dims; + specializations[3 + 6].i = out_shape_int8.w; + specializations[3 + 7].i = out_shape_int8.h; + specializations[3 + 8].i = out_shape_int8.c; + specializations[3 + 9].i = out_shape_int8.cstep; + + std::vector specializations_3d(3 + 12); + specializations_3d[0].i = type; + specializations_3d[1].f = value; + specializations_3d[2].i = per_channel_pad_data_size ? 
1 : 0; + specializations_3d[3 + 0].i = shape_unpacked.dims; + specializations_3d[3 + 1].i = shape_unpacked.w; + specializations_3d[3 + 2].i = shape_unpacked.h; + specializations_3d[3 + 3].i = shape_unpacked.d; + specializations_3d[3 + 4].i = shape_unpacked.c; + specializations_3d[3 + 5].i = shape_unpacked.cstep; + specializations_3d[3 + 6].i = out_shape_int8.dims; + specializations_3d[3 + 7].i = out_shape_int8.w; + specializations_3d[3 + 8].i = out_shape_int8.h; + specializations_3d[3 + 9].i = out_shape_int8.d; + specializations_3d[3 + 10].i = out_shape_int8.c; + specializations_3d[3 + 11].i = out_shape_int8.cstep; + + Mat local_size_xyz; + if (out_shape_int8.dims == 1) + { + local_size_xyz.w = std::min(64, out_shape_int8.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_int8.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_int8.w); + local_size_xyz.h = std::min(8, out_shape_int8.h); + local_size_xyz.c = 1; + } + if (out_shape_int8.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_int8.w); + local_size_xyz.h = std::min(4, out_shape_int8.h); + local_size_xyz.c = std::min(4, out_shape_int8.c); + } + if (out_shape_int8.dims == 4) + { + local_size_xyz.w = std::min(4, out_shape_int8.w); + local_size_xyz.h = std::min(4, out_shape_int8.h * out_shape_int8.d); + local_size_xyz.c = std::min(4, out_shape_int8.c); + } + + // pack1 + if (out_shape_int8.dims == 0 || (out_shape_int8.dims != 4 && offset_elempack == 1 && out_shape_int8.elempack == 1)) + { + pipeline_padding_int8 = new Pipeline(vkdev); + pipeline_padding_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_int8->create(LayerShaderType::padding_int8, opt, specializations); + } + + if (out_shape_int8.dims == 0 || (out_shape_int8.dims == 4 && offset_elempack == 1 && out_shape_int8.elempack == 1)) + { + pipeline_padding_3d_int8 = new Pipeline(vkdev); + pipeline_padding_3d_int8->set_optimal_local_size_xyz(local_size_xyz); + 
pipeline_padding_3d_int8->create(LayerShaderType::padding_3d_int8, opt, specializations_3d); + } + + // pack4 + if (out_shape_int8.dims == 0 || (out_shape_int8.dims != 4 && offset_elempack == 4 && out_shape_int8.elempack == 4)) + { + pipeline_padding_pack4_int8 = new Pipeline(vkdev); + pipeline_padding_pack4_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack4_int8->create(LayerShaderType::padding_pack4_int8, opt, specializations); + } + + if (out_shape_int8.dims == 0 || (out_shape_int8.dims == 4 && offset_elempack == 4 && out_shape_int8.elempack == 4)) + { + pipeline_padding_3d_pack4_int8 = new Pipeline(vkdev); + pipeline_padding_3d_pack4_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_3d_pack4_int8->create(LayerShaderType::padding_3d_pack4_int8, opt, specializations_3d); + } + + // pack1to4 + if (out_shape_int8.dims == 0 || (offset_elempack == 1 && out_shape_int8.elempack == 4)) + { + pipeline_padding_pack1to4_int8 = new Pipeline(vkdev); + pipeline_padding_pack1to4_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack1to4_int8->create(LayerShaderType::padding_pack1to4_int8, opt, specializations); + } + + // pack4to1 + if (out_shape_int8.dims == 0 || (offset_elempack == 4 && out_shape_int8.elempack == 1)) + { + pipeline_padding_pack4to1_int8 = new Pipeline(vkdev); + pipeline_padding_pack4to1_int8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack4to1_int8->create(LayerShaderType::padding_pack4to1_int8, opt, specializations); + } + + return 0; +} + +int Padding_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = 0; + int outh = 0; + int outd = 0; + int outc = 0; + + int offset_elempack; + int 
out_elempack; + + if (dims == 1) + { + if (left == 0 && right == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w * elempack + left + right; + out_elempack = outw % 4 == 0 ? 4 : 1; + offset_elempack = left == 0 ? elempack : left % 4 == 0 ? 4 : 1; + } + else if (dims == 2) + { + if (top == 0 && bottom == 0 && left == 0 && right == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + left + right; + outh = h * elempack + top + bottom; + out_elempack = outh % 4 == 0 ? 4 : 1; + offset_elempack = top == 0 ? elempack : top % 4 == 0 ? 4 : 1; + } + else if (dims == 3) + { + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + left + right; + outh = h + top + bottom; + outc = channels * elempack + front + behind; + out_elempack = outc % 4 == 0 ? 4 : 1; + offset_elempack = front == 0 ? elempack : front % 4 == 0 ? 4 : 1; + } + else // if (dims == 4) + { + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + left + right; + outh = h + top + bottom; + outd = d + front + behind; + outc = channels * elempack; + out_elempack = elempack; + offset_elempack = elempack; + } + + offset_elempack = std::min(offset_elempack, elempack); + + size_t out_elemsize = elemsize / elempack * out_elempack; + + // unpacking + VkMat bottom_blob_unpacked = bottom_blob; + if (elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + if (dims == 1) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else if (dims == 2) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else if (dims == 3) + { + top_blob.create(outw, outh, outc / 
out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else // if (dims == 4) + { + top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + bindings[2] = per_channel_pad_data_int8_gpu; + + if (dims == 4) + { + std::vector constants(15); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.d; + constants[4].i = bottom_blob_unpacked.c; + constants[5].i = bottom_blob_unpacked.cstep; + constants[6].i = top_blob.dims; + constants[7].i = top_blob.w; + constants[8].i = top_blob.h; + constants[9].i = top_blob.d; + constants[10].i = top_blob.c; + constants[11].i = top_blob.cstep; + constants[12].i = left; + constants[13].i = top; + constants[14].i = front; + + const Pipeline* pipeline = out_elempack == 4 ? 
pipeline_padding_3d_pack4_int8 : pipeline_padding_3d_int8; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; + } + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = left; + constants[11].i = top; + constants[12].i = front; + + const Pipeline* pipeline = 0; + if (offset_elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_padding_int8; + } + else if (offset_elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_padding_pack4_int8; + } + else if (offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_padding_pack1to4_int8; + } + else if (offset_elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_padding_pack4to1_int8; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +int Padding_vulkan::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkMat& bottom_blob = bottom_blobs[0]; + const VkMat& reference_blob = bottom_blobs[1]; + + VkMat& top_blob = top_blobs[0]; + int _top; + int _bottom; + int _left; + int _right; + int _front; + int _behind; + { + const int* param_data = reference_blob.mapped(); + + _top = param_data[0]; + _bottom = param_data[1]; + _left = param_data[2]; + _right = param_data[3]; + _front = param_data[4]; + _behind = param_data[5]; + } + + int dims = bottom_blob.dims; + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = 0; + int outh = 0; + int outd = 
0; + int outc = 0; + + int offset_elempack; + int out_elempack; + + if (dims == 1) + { + if (_left == 0 && _right == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w * elempack + _left + _right; + out_elempack = outw % 4 == 0 ? 4 : 1; + offset_elempack = _left == 0 ? elempack : _left % 4 == 0 ? 4 : 1; + } + else if (dims == 2) + { + if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + _left + _right; + outh = h * elempack + _top + _bottom; + out_elempack = outh % 4 == 0 ? 4 : 1; + offset_elempack = _top == 0 ? elempack : _top % 4 == 0 ? 4 : 1; + } + else if (dims == 3) + { + if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0 && _front == 0 && _behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + _left + _right; + outh = h + _top + _bottom; + outc = channels * elempack + _front + _behind; + out_elempack = outc % 4 == 0 ? 4 : 1; + offset_elempack = _front == 0 ? elempack : _front % 4 == 0 ? 
4 : 1; + } + else // if (dims == 4) + { + if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0 && _front == 0 && _behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + outw = w + _left + _right; + outh = h + _top + _bottom; + outd = d + _front + _behind; + outc = channels * elempack; + out_elempack = elempack; + offset_elempack = elempack; + } + + offset_elempack = std::min(offset_elempack, elempack); + + size_t out_elemsize = elemsize / elempack * out_elempack; + + // unpacking + VkMat bottom_blob_unpacked = bottom_blob; + if (elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + if (dims == 1) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else if (dims == 2) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else if (dims == 3) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + else // if (dims == 4) + { + top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + bindings[2] = per_channel_pad_data_int8_gpu; + + if (dims == 4) + { + std::vector constants(15); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.d; + constants[4].i = bottom_blob_unpacked.c; + constants[5].i = bottom_blob_unpacked.cstep; + constants[6].i = top_blob.dims; + constants[7].i = top_blob.w; + constants[8].i = top_blob.h; + constants[9].i = top_blob.d; + constants[10].i = top_blob.c; + constants[11].i = top_blob.cstep; + constants[12].i = _left; 
+ constants[13].i = _top; + constants[14].i = _front; + + const Pipeline* pipeline = out_elempack == 4 ? pipeline_padding_3d_pack4_int8 : pipeline_padding_3d_int8; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; + } + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = _left; + constants[11].i = _top; + constants[12].i = _front; + + const Pipeline* pipeline = 0; + if (offset_elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_padding_int8; + } + else if (offset_elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_padding_pack4_int8; + } + else if (offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_padding_pack1to4_int8; + } + else if (offset_elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_padding_pack4to1_int8; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h index c2abf6ea96fa..4d73bc037bc1 100644 --- a/src/layer/vulkan/padding_vulkan.h +++ b/src/layer/vulkan/padding_vulkan.h @@ -23,9 +23,21 @@ class Padding_vulkan : public Padding virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; +#endif 
// NCNN_INT8 + public: VkMat per_channel_pad_data_gpu; +#if NCNN_INT8 + Mat per_channel_pad_data_int8; + VkMat per_channel_pad_data_int8_gpu; +#endif // NCNN_INT8 + Pipeline* pipeline_padding; Pipeline* pipeline_padding_pack4; Pipeline* pipeline_padding_pack1to4; @@ -33,6 +45,16 @@ class Padding_vulkan : public Padding Pipeline* pipeline_padding_3d; Pipeline* pipeline_padding_3d_pack4; + +#if NCNN_INT8 + Pipeline* pipeline_padding_int8; + Pipeline* pipeline_padding_pack4_int8; + Pipeline* pipeline_padding_pack1to4_int8; + Pipeline* pipeline_padding_pack4to1_int8; + + Pipeline* pipeline_padding_3d_int8; + Pipeline* pipeline_padding_3d_pack4_int8; +#endif // NCNN_INT8 }; } // namespace ncnn diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp b/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp new file mode 100644 index 000000000000..b7a838f46a52 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp @@ -0,0 +1,739 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_basic : require + +#extension GL_KHR_memory_scope_semantics : require +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#if ncnn_VK_KHR_cooperative_matrix +#extension GL_KHR_cooperative_matrix : require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix : require +#extension GL_NV_integer_cooperative_matrix : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int use_int8_requantize = 0; +layout(constant_id = 5) const uint 
elempack = 1; +layout(constant_id = 6) const uint out_elempack = 1; + +#define shape_constant_id_offset 7 +layout(constant_id = shape_constant_id_offset + 0) const uint cstep = 0; +layout(constant_id = shape_constant_id_offset + 1) const uint outcstep = 0; +layout(constant_id = shape_constant_id_offset + 2) const uint size = 0; +layout(constant_id = shape_constant_id_offset + 3) const uint num_output = 0; +layout(constant_id = shape_constant_id_offset + 4) const uint num_input = 0; + +layout(constant_id = shape_constant_id_offset + 5 + 0) const uint M = 1; +layout(constant_id = shape_constant_id_offset + 5 + 1) const uint N = 1; +layout(constant_id = shape_constant_id_offset + 5 + 2) const uint K = 1; +layout(constant_id = shape_constant_id_offset + 5 + 3) const uint subgroup_size = 32; +layout(constant_id = shape_constant_id_offset + 5 + 4) const uint UNROLL_SG_M = 2; +layout(constant_id = shape_constant_id_offset + 5 + 5) const uint UNROLL_SG_N = 2; +layout(constant_id = shape_constant_id_offset + 5 + 6) const uint UNROLL_SG_K = 2; +layout(constant_id = shape_constant_id_offset + 5 + 7) const uint UNROLL_WG_M = 2; +layout(constant_id = shape_constant_id_offset + 5 + 8) const uint UNROLL_WG_N = 2; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; +layout(binding = 3) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 4) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 5) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 6) readonly buffer top_scales_blob { sfp top_scales_data[]; }; + +layout(push_constant) uniform parameter +{ + uint cstep; + uint outcstep; + uint size; + uint num_output; + uint num_input; +} p; + +const uint Nd4 = N / 4; +const uint Kd4 = K / 4; +const uint Md4 = 
M / 4; + +#if ncnn_VK_KHR_cooperative_matrix +#define PAD 1 +#elif ncnn_VK_NV_cooperative_matrix +#define PAD 0 +#endif + +const uint Nd4p = Nd4 + PAD; /* ints per row of a staged weight tile: N / 4 packed columns plus PAD */ +const uint Kd4p = Kd4 + PAD; /* ints per row of a staged elempack-4 input tile: K / 4 packed columns plus PAD */ + +shared int tmp_v[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; /* staged input tiles, one slab per sgmi */ +shared int tmp_k[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; /* staged weight tiles, one slab per sgni */ +shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; /* accumulator staging, one slab per subgroup (sgi) */ + +void main() +{ + const uint wgi = gl_WorkGroupID.x; + const uint sgi = gl_SubgroupID; + + const uint wgmm = (psc(size) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M); /* workgroup tiles needed to cover size along M */ + const uint wgnn = (psc(num_output) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N); /* workgroup tiles needed to cover num_output along N */ + + const uint wgmi = wgi / wgnn; + const uint wgni = wgi % wgnn; + + const uint sgmi = sgi / UNROLL_WG_N; + const uint sgni = sgi % UNROLL_WG_N; + + const uint kk = (psc(num_input) + K - 1) / K; /* K-sized steps needed to cover num_input */ + const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; /* kk rounded up to a multiple of UNROLL_SG_K */ + + if (wgmi >= wgmm) + return; /* depends on gl_WorkGroupID only, so the whole workgroup leaves together before any barrier() */ + + const uint si = gl_SubgroupInvocationID; + + const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M; /* first M tile handled by this subgroup */ + const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N; /* first N tile handled by this subgroup */ + +#if ncnn_VK_KHR_cooperative_matrix + coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum[UNROLL_SG_N][UNROLL_SG_M]; /* 32-bit accumulators, one per (zn, zm) sub-tile; type arguments mirror the NV branch */ +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M]; +#endif + + { + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0); +#endif + } + } + } + + uint k = 0; + + if (kk >= UNROLL_SG_K * 2) + { + // local stack and shared memory ping-pong + + // prefetch + int prefetch_tmp_v[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4p + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)]; + int prefetch_tmp_k[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4p +
(subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)]; + + // prefetch the very first + { + if (elempack == 1) + { + const uint cstepd4 = psc(cstep) / 4; + + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = zk * K + i; + const uint gm = (mi + zm) * Md4 + j; + + prefetch_tmp_v[q] = gk < psc(num_input) && gm < cstepd4 ? i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zmi = zmij / Kd4p; + const uint j = zmij % Kd4p; + + const uint gm = mi * M + zmi; + const uint gk = zk * K + j * 4; + + const int v = gm < psc(size) && gk < psc(num_input) ? 
i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0; + + prefetch_tmp_v[q] = v; + } + } + } + } + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq); + } + } + } + + for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K) + { + // copy prefetched tile to shared memory + { + if (elempack == 1) + { + const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + } + { + const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K; + const uint 
K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = prefetch_tmp_k[q]; + } + } + } + + barrier(); + + // prefetch next tile + const uint ki = k + UNROLL_SG_K; + { + if (elempack == 1) + { + const uint cstepd4 = psc(cstep) / 4; + + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + + prefetch_tmp_v[q] = gk < psc(num_input) && gm < cstepd4 ? 
i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zmi = zmij / Kd4p; + const uint j = zmij % Kd4p; + + const uint gm = mi * M + zmi; + const uint gk = (ki + zk) * K + j * 4; + + const int v = gm < psc(size) && gk < psc(num_input) ? i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0; /* out-of-range rows / input channels are staged as zero */ + + prefetch_tmp_v[q] = v; /* kept in the local array; copied into tmp_v at the top of the next iteration */ + } + } + } + } + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq); /* weights are read linearly from w_offset, with no bounds check */ + } + } + } + +#if ncnn_VK_KHR_cooperative_matrix + coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M]; /* 8-bit M x K input fragments; type arguments mirror the NV branch */ + coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N]; /* 8-bit K x N weight fragments */ +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0;
zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + + // copy and compute the last prefetched tile + { + if (elempack == 1) + { + const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM_USGK = M * Kd4p * 
UNROLL_SG_M * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + } + { + const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = prefetch_tmp_k[q]; + } + } + } + + barrier(); /* tmp_v / tmp_k are filled cooperatively by several subgroups, so wait before loading fragments */ + +#if ncnn_VK_KHR_cooperative_matrix + coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M]; /* 8-bit M x K input fragments; type arguments mirror the NV branch */ + coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N]; /* 8-bit K x N weight fragments */ +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); /* elempack-1 tiles are staged with stride Md4 and loaded column-major */ +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); /* elempack-4 tiles are staged with stride Kd4p and loaded row-major */ +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn <
UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + else + { + for (uint ki = 0; ki < kk; ki += UNROLL_SG_K) + { + { + if (elempack == 1) + { + const uint cstepd4 = psc(cstep) / 4; + + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + + tmp_v[sgmi][siq] = gk < psc(num_input) && gm < cstepd4 ? 
i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zm = zmij / (M * Kd4p); + const uint ij = zmij % (M * Kd4p); + const uint i = ij / Kd4p; + const uint j = ij % Kd4p; + + const uint gm = (mi + zm) * M + i; + const uint gk = (ki + zk) * K + j * 4; + const int v = gm < psc(size) && gk < psc(num_input) ? i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0; /* out-of-range rows / input channels are staged as zero */ + + tmp_v[sgmi][siq] = v; /* this path writes shared memory directly, with no prefetch array */ + } + } + } + } + + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = i8buffer_sm4(weight_data, w_offset + siq); /* weights are read linearly from w_offset, with no bounds check */ + } + } + } + + barrier(); /* tmp_v / tmp_k are filled cooperatively by several subgroups, so wait before loading fragments */ + +#if ncnn_VK_KHR_cooperative_matrix + coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M]; /* 8-bit M x K input fragments; type arguments mirror the NV branch */ + coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N]; /* 8-bit K x N weight fragments */ +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + +
[[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (out_elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true); +#endif + } + else // out_elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], 
tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false); +#endif + } + } + } + + barrier(); + + if (out_elempack == 1) + { + const uint outcstepd4 = psc(outcstep) / 4; + const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N; + const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN) + { + const uint zn = siq / (Md4 * N * UNROLL_SG_M); + const uint zmij = siq % (Md4 * N * UNROLL_SG_M); + const uint zm = zmij / (Md4 * N); + const uint ij = zmij % (Md4 * N); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gn = (ni + zn) * N + i; + const uint gm = (mi + zm) * Md4 + j; + + if (gn < psc(num_output) && gm * 4 < psc(size)) + { + const ivec4 sumi = tmp_o[sgi][siq]; + const int gn4 = int(gn % 4); + vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4][gn4]; + + if (bias_term == 1) + { + sumfp += bias_data[gn / 4][gn4]; + } + + sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = float(buffer_ld1(top_scales_data, 0)); + sumfp *= top_scale; + i8buffer_st4(top_blob_int8_data, gn * outcstepd4 + gm, float2int8vec4(sumfp)); + } + else + { + buffer_st4(top_blob_data, gn * outcstepd4 + gm, afpvec4(sumfp)); + } + } + } + } + } + else // out_elempack == 4 + { + const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N; + const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if 
(M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN) + { + const uint zn = siq / (M * Nd4 * UNROLL_SG_M); + const uint zmij = siq % (M * Nd4 * UNROLL_SG_M); + const uint zm = zmij / (M * Nd4); + const uint ij = zmij % (M * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gm = (mi + zm) * M + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gm < psc(size) && gn < psc(num_output)) + { + const ivec4 sumi = tmp_o[sgi][siq]; + vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4]; + + if (bias_term == 1) + { + sumfp += bias_data[gn / 4]; + } + + sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = float(buffer_ld1(top_scales_data, 0)); + sumfp *= top_scale; + i8buffer_st4(top_blob_int8_data, (gn / 4) * psc(outcstep) + gm, float2int8vec4(sumfp)); + } + else + { + buffer_st4(top_blob_data, (gn / 4) * psc(outcstep) + gm, afpvec4(sumfp)); + } + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp new file mode 100644 index 000000000000..1607abb48fec --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp @@ -0,0 +1,131 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; /* channel count; read directly below, never through psc() */ + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding
= 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; /* transformed coefficients, written through i16buffer_st1 */ + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + int c; /* NOTE(review): p.c is declared but this shader only reads the specialization constant c - confirm the host always specializes it */ +} p; + +void main() /* each invocation transforms one 4x4 input window of channel gz into 16 coefficients */ +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) + return; + + // load 4x4 + int sx = gx * 2; /* windows advance by 2 columns, so neighbouring 4-wide windows overlap by 2 */ + int sy = gy * 2; /* windows advance by 2 rows */ + + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); /* element offsets of input rows sy .. sy + 3 */ + + int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); /* NOTE(review): the only unguarded load - presumably (sx, sy) is always inside the image; confirm against the host-side block_x / block_y */ + int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0; /* positions past the right / bottom edge read as zero */ + int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0; + int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0; + + int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0; + int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0; + int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0; + int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0; + + int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0; + int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0; + int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0; + int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0; + + int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0; + int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ?
i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0; + int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0; + int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0; + + // implicit transpose + int m00 = v00 - v02; /* first pass, along each input row r: m0r = d0 - d2 */ + int m01 = v10 - v12; + int m02 = v20 - v22; + int m03 = v30 - v32; + + int m10 = v02 + v01; /* m1r = d2 + d1 */ + int m11 = v12 + v11; + int m12 = v22 + v21; + int m13 = v32 + v31; + + int m20 = v02 - v01; /* m2r = d2 - d1 */ + int m21 = v12 - v11; + int m22 = v22 - v21; + int m23 = v32 - v31; + + int m30 = v03 - v01; /* m3r = d3 - d1 */ + int m31 = v13 - v11; + int m32 = v23 - v21; + int m33 = v33 - v31; + + v00 = m00 - m02; /* second pass, the same four combinations applied across r: vk0 = mk0 - mk2 */ + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; /* vk1 = mk2 + mk1 */ + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; /* vk2 = mk2 - mk1 */ + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; /* vk3 = mk3 - mk1 */ + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 + int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; /* position of window (gx, gy) inside channel gz */ + int v_tm_step = psc(outcstep) * c; /* distance between consecutive coefficients: each of the 16 spans c channels of outcstep elements */ + + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00); /* coefficient vkl is stored at index 4 * k + l */ + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v10); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v11); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v12); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v13); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v20); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v21); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v22); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v23); +
i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v30); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v31); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v32); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v33); +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp new file mode 100644 index 000000000000..0249e01fac1d --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp @@ -0,0 +1,180 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_low_blob { sint8 bottom_tm_low_data[]; }; +layout(binding = 2) writeonly buffer bottom_tm_high_blob { sint8 bottom_tm_high_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + int c; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) + return; + + // load 4x4 + int sx = gx * 2; + int sy = gy * 2; + + int v_offset_0 = gz * psc(cstep) + sy *
psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + + int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); + int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0; + int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0; + int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0; + + int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0; + int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0; + int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0; + int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0; + + int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0; + int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0; + int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0; + int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0; + + int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0; + int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0; + int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0; + int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0; + + // implicit transpose + int m00 = v00 - v02; + int m01 = v10 - v12; + int m02 = v20 - v22; + int m03 = v30 - v32; + + int m10 = v02 + v01; + int m11 = v12 + v11; + int m12 = v22 + v21; + int m13 = v32 + v31; + + int m20 = v02 - v01; + int m21 = v12 - v11; + int m22 = v22 - v21; + int m23 = v32 - v31; + + int m30 = v03 - v01; + int m31 = v13 - v11; + int m32 = v23 - v21; + int m33 = v33 - v31; + + v00 = m00 - m02; + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 + int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; + int v_tm_step = psc(outcstep) * c; + + int v00_low = v00 & 255; + v00_low = v00_low >= 128 ? v00_low - 256 : v00_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 0 * v_tm_step, v00_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 0 * v_tm_step, (v00 - v00_low) >> 8); + int v01_low = v01 & 255; + v01_low = v01_low >= 128 ? v01_low - 256 : v01_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 1 * v_tm_step, v01_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 1 * v_tm_step, (v01 - v01_low) >> 8); + int v02_low = v02 & 255; + v02_low = v02_low >= 128 ? v02_low - 256 : v02_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 2 * v_tm_step, v02_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 2 * v_tm_step, (v02 - v02_low) >> 8); + int v03_low = v03 & 255; + v03_low = v03_low >= 128 ? v03_low - 256 : v03_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 3 * v_tm_step, v03_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 3 * v_tm_step, (v03 - v03_low) >> 8); + int v10_low = v10 & 255; + v10_low = v10_low >= 128 ? 
v10_low - 256 : v10_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 4 * v_tm_step, v10_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 4 * v_tm_step, (v10 - v10_low) >> 8); + int v11_low = v11 & 255; + v11_low = v11_low >= 128 ? v11_low - 256 : v11_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 5 * v_tm_step, v11_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 5 * v_tm_step, (v11 - v11_low) >> 8); + int v12_low = v12 & 255; + v12_low = v12_low >= 128 ? v12_low - 256 : v12_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 6 * v_tm_step, v12_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 6 * v_tm_step, (v12 - v12_low) >> 8); + int v13_low = v13 & 255; + v13_low = v13_low >= 128 ? v13_low - 256 : v13_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 7 * v_tm_step, v13_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 7 * v_tm_step, (v13 - v13_low) >> 8); + int v20_low = v20 & 255; + v20_low = v20_low >= 128 ? v20_low - 256 : v20_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 8 * v_tm_step, v20_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 8 * v_tm_step, (v20 - v20_low) >> 8); + int v21_low = v21 & 255; + v21_low = v21_low >= 128 ? v21_low - 256 : v21_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 9 * v_tm_step, v21_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 9 * v_tm_step, (v21 - v21_low) >> 8); + int v22_low = v22 & 255; + v22_low = v22_low >= 128 ? v22_low - 256 : v22_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 10 * v_tm_step, v22_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 10 * v_tm_step, (v22 - v22_low) >> 8); + int v23_low = v23 & 255; + v23_low = v23_low >= 128 ? v23_low - 256 : v23_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 11 * v_tm_step, v23_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 11 * v_tm_step, (v23 - v23_low) >> 8); + int v30_low = v30 & 255; + v30_low = v30_low >= 128 ? 
v30_low - 256 : v30_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 12 * v_tm_step, v30_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 12 * v_tm_step, (v30 - v30_low) >> 8); + int v31_low = v31 & 255; + v31_low = v31_low >= 128 ? v31_low - 256 : v31_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 13 * v_tm_step, v31_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 13 * v_tm_step, (v31 - v31_low) >> 8); + int v32_low = v32 & 255; + v32_low = v32_low >= 128 ? v32_low - 256 : v32_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 14 * v_tm_step, v32_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 14 * v_tm_step, (v32 - v32_low) >> 8); + int v33_low = v33 & 255; + v33_low = v33_low >= 128 ? v33_low - 256 : v33_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 15 * v_tm_step, v33_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 15 * v_tm_step, (v33 - v33_low) >> 8); +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp new file mode 100644 index 000000000000..206409c65f9c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp @@ -0,0 +1,140 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int use_int8_requantize = 0; +layout(binding = 0) readonly buffer top_tm_blob { int top_tm_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 2) readonly buffer bias_blob { float bias_data[]; }; +layout(binding = 3) readonly buffer 
weight_descales_blob { float weight_descales_data[]; }; +layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 5) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; + int outc; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= p.outc) + return; + + // load 16 + int v_tm_offset = gz * p.cstep + gy * p.block_x + gx; + int v_tm_step = p.cstep * p.outc; + + int v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step]; + int v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step]; + int v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step]; + int v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step]; + int v10 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step]; + int v11 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step]; + int v12 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step]; + int v13 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step]; + int v20 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step]; + int v21 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step]; + int v22 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step]; + int v23 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step]; + int v30 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step]; + int v31 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step]; + int v32 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step]; + int v33 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step]; + + int m00 = v00 + v01 + v02; + int m01 = v10 + v11 + v12; + int m02 = v20 + v21 + v22; + int m03 = v30 + v31 + v32; + + int m10 = v01 - v02 + v03; + int m11 = v11 - v12 + v13; + int m12 = v21 - v22 + v23; + int m13 = v31 - v32 + v33; + + const float descale = weight_descales_data[gz] * 0.25f; + + float out00 = float(m00 + m01 + m02) * 
descale; + float out01 = float(m01 - m02 + m03) * descale; + float out10 = float(m10 + m11 + m12) * descale; + float out11 = float(m11 - m12 + m13) * descale; + + if (bias_term == 1) + { + const float bias_value = bias_data[gz]; + + out00 += bias_value; + out01 += bias_value; + out10 += bias_value; + out11 += bias_value; + } + + out00 = float(activation_afp(afp(out00), activation_type, activation_param_0, activation_param_1)); + out01 = float(activation_afp(afp(out01), activation_type, activation_param_0, activation_param_1)); + out10 = float(activation_afp(afp(out10), activation_type, activation_param_0, activation_param_1)); + out11 = float(activation_afp(afp(out11), activation_type, activation_param_0, activation_param_1)); + + // store 2x2 + int x = gx * 2; + int y = gy * 2; + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + + out00 *= top_scale; + out01 *= top_scale; + out10 *= top_scale; + out11 *= top_scale; + + int out00_int8 = float2int8(out00); + int out01_int8 = float2int8(out01); + int out10_int8 = float2int8(out10); + int out11_int8 = float2int8(out11); + + int v_offset_0 = gz * p.outcstep + y * p.outw + x; + int v_offset_1 = v_offset_0 + p.outw; + + i8buffer_st1(top_blob_int8_data, v_offset_0 + 0, out00_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset_0 + 1, out01_int8); + + if (y + 1 < p.outh) + { + i8buffer_st1(top_blob_int8_data, v_offset_1 + 0, out10_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset_1 + 1, out11_int8); + } + } + else + { + int v_offset_0 = gz * p.outcstep + y * p.outw + x; + int v_offset_1 = v_offset_0 + p.outw; + + buffer_st1(top_blob_data, v_offset_0 + 0, afp(out00)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset_0 + 1, afp(out01)); + + if (y + 1 < p.outh) + { + buffer_st1(top_blob_data, v_offset_1 + 0, afp(out10)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset_1 + 1, afp(out11)); + } + } +} diff --git 
a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp new file mode 100644 index 000000000000..578079b2aca7 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp @@ -0,0 +1,219 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + + int c; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) + return; + + // load 6x6 + int sx = gx * 4; + int sy = gy * 4; + + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); + + int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); + int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0; + int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0; + int v03 = sx + 3 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0; + int v04 = sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 4) : 0; + int v05 = sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 5) : 0; + + int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0; + int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0; + int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0; + int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0; + int v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 4) : 0; + int v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 5) : 0; + + int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0; + int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0; + int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0; + int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0; + int v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 4) : 0; + int v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 5) : 0; + + int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0; + int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0; + int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0; + int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0; + int v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 4) : 0; + int v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 5) : 0; + + int v40 = sy + 4 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 0) : 0; + int v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 1) : 0; + int v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 2) : 0; + int v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 3) : 0; + int v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 4) : 0; + int v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 5) : 0; + + int v50 = sy + 5 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 0) : 0; + int v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 1) : 0; + int v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 2) : 0; + int v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 3) : 0; + int v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 4) : 0; + int v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 5) : 0; + + // implicit transpose + int m00 = v04 + v00 * 4 - v02 * 5; + int m01 = v14 + v10 * 4 - v12 * 5; + int m02 = v24 + v20 * 4 - v22 * 5; + int m03 = v34 + v30 * 4 - v32 * 5; + int m04 = v44 + v40 * 4 - v42 * 5; + int m05 = v54 + v50 * 4 - v52 * 5; + + int m10 = (v04 - v02 * 4) + (v03 - v01 * 4); + int m11 = (v14 - v12 * 4) + (v13 - v11 * 4); + int m12 = (v24 - v22 * 4) + (v23 - v21 * 4); + int m13 = (v34 - v32 * 4) + (v33 - v31 * 4); + int m14 = (v44 - v42 * 4) + (v43 - v41 * 4); + int m15 = (v54 - v52 * 4) + (v53 - v51 * 4); + + int m20 = (v04 - v02 * 4) - (v03 - v01 * 4); + int m21 = (v14 - v12 * 4) - (v13 - v11 * 4); + int m22 = (v24 - v22 * 4) - (v23 - v21 * 4); + int m23 = (v34 - v32 * 4) - (v33 - v31 * 4); + int m24 = (v44 - v42 * 4) - (v43 - v41 * 4); + int m25 = (v54 - v52 * 4) - (v53 - v51 * 4); + + int m30 = (v04 - v02) + (v03 - v01) * 2; + int m31 = (v14 - v12) + (v13 - v11) * 2; + int m32 = (v24 - v22) + (v23 - v21) * 2; + int m33 = (v34 - v32) + (v33 - v31) * 2; + int m34 = (v44 - v42) + (v43 - v41) * 2; + int m35 = (v54 - v52) + (v53 - v51) * 2; + + int m40 = (v04 - v02) - (v03 - v01) * 2; + int m41 = (v14 - v12) - (v13 - v11) * 2; + int m42 = (v24 - v22) - (v23 - v21) * 2; + int m43 = (v34 - v32) - (v33 - v31) * 2; + int m44 = (v44 - v42) - (v43 - v41) * 2; + int m45 = (v54 - v52) - (v53 - v51) * 2; + + int m50 = v05 + v01 * 4 - v03 * 5; + int m51 = v15 + v11 * 4 - v13 * 5; + int m52 = v25 + v21 * 4 - v23 * 5; + int m53 = v35 + v31 * 4 - v33 * 5; + int m54 = v45 + v41 * 4 - v43 * 5; + int m55 = v55 + v51 * 4 - v53 * 5; + + v00 = m04 + m00 * 4 - m02 * 5; + v10 = m14 + m10 * 4 - m12 * 5; + v20 = m24 + m20 * 4 - m22 * 5; + v30 = m34 + m30 * 4 - m32 * 5; + v40 = m44 + m40 * 4 - m42 * 5; + v50 = m54 + m50 * 4 - m52 * 5; + + v01 = (m04 - m02 * 4) + (m03 - m01 * 4); + v11 = (m14 - m12 * 4) + (m13 - m11 * 4); + v21 = (m24 - m22 * 4) + (m23 - m21 * 4); + v31 = (m34 - m32 * 4) + (m33 - m31 * 4); + v41 
= (m44 - m42 * 4) + (m43 - m41 * 4); + v51 = (m54 - m52 * 4) + (m53 - m51 * 4); + + v02 = (m04 - m02 * 4) - (m03 - m01 * 4); + v12 = (m14 - m12 * 4) - (m13 - m11 * 4); + v22 = (m24 - m22 * 4) - (m23 - m21 * 4); + v32 = (m34 - m32 * 4) - (m33 - m31 * 4); + v42 = (m44 - m42 * 4) - (m43 - m41 * 4); + v52 = (m54 - m52 * 4) - (m53 - m51 * 4); + + v03 = (m04 - m02) + (m03 - m01) * 2; + v13 = (m14 - m12) + (m13 - m11) * 2; + v23 = (m24 - m22) + (m23 - m21) * 2; + v33 = (m34 - m32) + (m33 - m31) * 2; + v43 = (m44 - m42) + (m43 - m41) * 2; + v53 = (m54 - m52) + (m53 - m51) * 2; + + v04 = (m04 - m02) - (m03 - m01) * 2; + v14 = (m14 - m12) - (m13 - m11) * 2; + v24 = (m24 - m22) - (m23 - m21) * 2; + v34 = (m34 - m32) - (m33 - m31) * 2; + v44 = (m44 - m42) - (m43 - m41) * 2; + v54 = (m54 - m52) - (m53 - m51) * 2; + + v05 = m05 + m01 * 4 - m03 * 5; + v15 = m15 + m11 * 4 - m13 * 5; + v25 = m25 + m21 * 4 - m23 * 5; + v35 = m35 + m31 * 4 - m33 * 5; + v45 = m45 + m41 * 4 - m43 * 5; + v55 = m55 + m51 * 4 - m53 * 5; + + // store 36 + int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; + int v_tm_step = psc(outcstep) * c; + + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v04); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v05); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v10); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v11); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v12); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v13); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v14); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v15); + 
i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v20); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v21); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v22); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v23); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 16 * v_tm_step, v24); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 17 * v_tm_step, v25); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 18 * v_tm_step, v30); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 19 * v_tm_step, v31); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 20 * v_tm_step, v32); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 21 * v_tm_step, v33); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 22 * v_tm_step, v34); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 23 * v_tm_step, v35); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 24 * v_tm_step, v40); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 25 * v_tm_step, v41); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 26 * v_tm_step, v42); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 27 * v_tm_step, v43); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 28 * v_tm_step, v44); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 29 * v_tm_step, v45); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 30 * v_tm_step, v50); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 31 * v_tm_step, v51); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 32 * v_tm_step, v52); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 33 * v_tm_step, v53); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 34 * v_tm_step, v54); + i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 35 * v_tm_step, v55); +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp new file mode 100644 index 000000000000..1e82a72d20e3 --- 
/dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp @@ -0,0 +1,328 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_low_blob { sint8 bottom_tm_low_data[]; }; +layout(binding = 2) writeonly buffer bottom_tm_high_blob { sint8 bottom_tm_high_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + + int c; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) + return; + + // load 6x6 + int sx = gx * 4; + int sy = gy * 4; + + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); + + int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); + int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0; + int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0; + int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0; + int v04 = sx + 4 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 4) : 0; + int v05 = sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 5) : 0; + + int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0; + int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0; + int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0; + int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0; + int v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 4) : 0; + int v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 5) : 0; + + int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0; + int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0; + int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0; + int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0; + int v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 4) : 0; + int v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 5) : 0; + + int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0; + int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0; + int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0; + int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0; + int v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 4) : 0; + int v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 5) : 0; + + int v40 = sy + 4 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 0) : 0; + int v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 1) : 0; + int v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 2) : 0; + int v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 3) : 0; + int v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 4) : 0; + int v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 5) : 0; + + int v50 = sy + 5 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 0) : 0; + int v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 1) : 0; + int v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 2) : 0; + int v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 3) : 0; + int v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 4) : 0; + int v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? 
i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 5) : 0; + + // implicit transpose + int m00 = v04 + v00 * 4 - v02 * 5; + int m01 = v14 + v10 * 4 - v12 * 5; + int m02 = v24 + v20 * 4 - v22 * 5; + int m03 = v34 + v30 * 4 - v32 * 5; + int m04 = v44 + v40 * 4 - v42 * 5; + int m05 = v54 + v50 * 4 - v52 * 5; + + int m10 = (v04 - v02 * 4) + (v03 - v01 * 4); + int m11 = (v14 - v12 * 4) + (v13 - v11 * 4); + int m12 = (v24 - v22 * 4) + (v23 - v21 * 4); + int m13 = (v34 - v32 * 4) + (v33 - v31 * 4); + int m14 = (v44 - v42 * 4) + (v43 - v41 * 4); + int m15 = (v54 - v52 * 4) + (v53 - v51 * 4); + + int m20 = (v04 - v02 * 4) - (v03 - v01 * 4); + int m21 = (v14 - v12 * 4) - (v13 - v11 * 4); + int m22 = (v24 - v22 * 4) - (v23 - v21 * 4); + int m23 = (v34 - v32 * 4) - (v33 - v31 * 4); + int m24 = (v44 - v42 * 4) - (v43 - v41 * 4); + int m25 = (v54 - v52 * 4) - (v53 - v51 * 4); + + int m30 = (v04 - v02) + (v03 - v01) * 2; + int m31 = (v14 - v12) + (v13 - v11) * 2; + int m32 = (v24 - v22) + (v23 - v21) * 2; + int m33 = (v34 - v32) + (v33 - v31) * 2; + int m34 = (v44 - v42) + (v43 - v41) * 2; + int m35 = (v54 - v52) + (v53 - v51) * 2; + + int m40 = (v04 - v02) - (v03 - v01) * 2; + int m41 = (v14 - v12) - (v13 - v11) * 2; + int m42 = (v24 - v22) - (v23 - v21) * 2; + int m43 = (v34 - v32) - (v33 - v31) * 2; + int m44 = (v44 - v42) - (v43 - v41) * 2; + int m45 = (v54 - v52) - (v53 - v51) * 2; + + int m50 = v05 + v01 * 4 - v03 * 5; + int m51 = v15 + v11 * 4 - v13 * 5; + int m52 = v25 + v21 * 4 - v23 * 5; + int m53 = v35 + v31 * 4 - v33 * 5; + int m54 = v45 + v41 * 4 - v43 * 5; + int m55 = v55 + v51 * 4 - v53 * 5; + + v00 = m04 + m00 * 4 - m02 * 5; + v10 = m14 + m10 * 4 - m12 * 5; + v20 = m24 + m20 * 4 - m22 * 5; + v30 = m34 + m30 * 4 - m32 * 5; + v40 = m44 + m40 * 4 - m42 * 5; + v50 = m54 + m50 * 4 - m52 * 5; + + v01 = (m04 - m02 * 4) + (m03 - m01 * 4); + v11 = (m14 - m12 * 4) + (m13 - m11 * 4); + v21 = (m24 - m22 * 4) + (m23 - m21 * 4); + v31 = (m34 - m32 * 4) + (m33 - m31 * 4); + v41 
= (m44 - m42 * 4) + (m43 - m41 * 4); + v51 = (m54 - m52 * 4) + (m53 - m51 * 4); + + v02 = (m04 - m02 * 4) - (m03 - m01 * 4); + v12 = (m14 - m12 * 4) - (m13 - m11 * 4); + v22 = (m24 - m22 * 4) - (m23 - m21 * 4); + v32 = (m34 - m32 * 4) - (m33 - m31 * 4); + v42 = (m44 - m42 * 4) - (m43 - m41 * 4); + v52 = (m54 - m52 * 4) - (m53 - m51 * 4); + + v03 = (m04 - m02) + (m03 - m01) * 2; + v13 = (m14 - m12) + (m13 - m11) * 2; + v23 = (m24 - m22) + (m23 - m21) * 2; + v33 = (m34 - m32) + (m33 - m31) * 2; + v43 = (m44 - m42) + (m43 - m41) * 2; + v53 = (m54 - m52) + (m53 - m51) * 2; + + v04 = (m04 - m02) - (m03 - m01) * 2; + v14 = (m14 - m12) - (m13 - m11) * 2; + v24 = (m24 - m22) - (m23 - m21) * 2; + v34 = (m34 - m32) - (m33 - m31) * 2; + v44 = (m44 - m42) - (m43 - m41) * 2; + v54 = (m54 - m52) - (m53 - m51) * 2; + + v05 = m05 + m01 * 4 - m03 * 5; + v15 = m15 + m11 * 4 - m13 * 5; + v25 = m25 + m21 * 4 - m23 * 5; + v35 = m35 + m31 * 4 - m33 * 5; + v45 = m45 + m41 * 4 - m43 * 5; + v55 = m55 + m51 * 4 - m53 * 5; + + // store 36 + int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; + int v_tm_step = psc(outcstep) * c; + + int v00_low = v00 & 255; + v00_low = v00_low >= 128 ? v00_low - 256 : v00_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 0 * v_tm_step, v00_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 0 * v_tm_step, (v00 - v00_low) >> 8); + int v01_low = v01 & 255; + v01_low = v01_low >= 128 ? v01_low - 256 : v01_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 1 * v_tm_step, v01_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 1 * v_tm_step, (v01 - v01_low) >> 8); + int v02_low = v02 & 255; + v02_low = v02_low >= 128 ? v02_low - 256 : v02_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 2 * v_tm_step, v02_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 2 * v_tm_step, (v02 - v02_low) >> 8); + int v03_low = v03 & 255; + v03_low = v03_low >= 128 ? 
v03_low - 256 : v03_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 3 * v_tm_step, v03_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 3 * v_tm_step, (v03 - v03_low) >> 8); + int v04_low = v04 & 255; + v04_low = v04_low >= 128 ? v04_low - 256 : v04_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 4 * v_tm_step, v04_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 4 * v_tm_step, (v04 - v04_low) >> 8); + int v05_low = v05 & 255; + v05_low = v05_low >= 128 ? v05_low - 256 : v05_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 5 * v_tm_step, v05_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 5 * v_tm_step, (v05 - v05_low) >> 8); + int v10_low = v10 & 255; + v10_low = v10_low >= 128 ? v10_low - 256 : v10_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 6 * v_tm_step, v10_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 6 * v_tm_step, (v10 - v10_low) >> 8); + int v11_low = v11 & 255; + v11_low = v11_low >= 128 ? v11_low - 256 : v11_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 7 * v_tm_step, v11_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 7 * v_tm_step, (v11 - v11_low) >> 8); + int v12_low = v12 & 255; + v12_low = v12_low >= 128 ? v12_low - 256 : v12_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 8 * v_tm_step, v12_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 8 * v_tm_step, (v12 - v12_low) >> 8); + int v13_low = v13 & 255; + v13_low = v13_low >= 128 ? v13_low - 256 : v13_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 9 * v_tm_step, v13_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 9 * v_tm_step, (v13 - v13_low) >> 8); + int v14_low = v14 & 255; + v14_low = v14_low >= 128 ? v14_low - 256 : v14_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 10 * v_tm_step, v14_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 10 * v_tm_step, (v14 - v14_low) >> 8); + int v15_low = v15 & 255; + v15_low = v15_low >= 128 ? 
v15_low - 256 : v15_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 11 * v_tm_step, v15_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 11 * v_tm_step, (v15 - v15_low) >> 8); + int v20_low = v20 & 255; + v20_low = v20_low >= 128 ? v20_low - 256 : v20_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 12 * v_tm_step, v20_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 12 * v_tm_step, (v20 - v20_low) >> 8); + int v21_low = v21 & 255; + v21_low = v21_low >= 128 ? v21_low - 256 : v21_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 13 * v_tm_step, v21_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 13 * v_tm_step, (v21 - v21_low) >> 8); + int v22_low = v22 & 255; + v22_low = v22_low >= 128 ? v22_low - 256 : v22_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 14 * v_tm_step, v22_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 14 * v_tm_step, (v22 - v22_low) >> 8); + int v23_low = v23 & 255; + v23_low = v23_low >= 128 ? v23_low - 256 : v23_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 15 * v_tm_step, v23_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 15 * v_tm_step, (v23 - v23_low) >> 8); + int v24_low = v24 & 255; + v24_low = v24_low >= 128 ? v24_low - 256 : v24_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 16 * v_tm_step, v24_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 16 * v_tm_step, (v24 - v24_low) >> 8); + int v25_low = v25 & 255; + v25_low = v25_low >= 128 ? v25_low - 256 : v25_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 17 * v_tm_step, v25_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 17 * v_tm_step, (v25 - v25_low) >> 8); + int v30_low = v30 & 255; + v30_low = v30_low >= 128 ? v30_low - 256 : v30_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 18 * v_tm_step, v30_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 18 * v_tm_step, (v30 - v30_low) >> 8); + int v31_low = v31 & 255; + v31_low = v31_low >= 128 ? 
v31_low - 256 : v31_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 19 * v_tm_step, v31_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 19 * v_tm_step, (v31 - v31_low) >> 8); + int v32_low = v32 & 255; + v32_low = v32_low >= 128 ? v32_low - 256 : v32_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 20 * v_tm_step, v32_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 20 * v_tm_step, (v32 - v32_low) >> 8); + int v33_low = v33 & 255; + v33_low = v33_low >= 128 ? v33_low - 256 : v33_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 21 * v_tm_step, v33_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 21 * v_tm_step, (v33 - v33_low) >> 8); + int v34_low = v34 & 255; + v34_low = v34_low >= 128 ? v34_low - 256 : v34_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 22 * v_tm_step, v34_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 22 * v_tm_step, (v34 - v34_low) >> 8); + int v35_low = v35 & 255; + v35_low = v35_low >= 128 ? v35_low - 256 : v35_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 23 * v_tm_step, v35_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 23 * v_tm_step, (v35 - v35_low) >> 8); + int v40_low = v40 & 255; + v40_low = v40_low >= 128 ? v40_low - 256 : v40_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 24 * v_tm_step, v40_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 24 * v_tm_step, (v40 - v40_low) >> 8); + int v41_low = v41 & 255; + v41_low = v41_low >= 128 ? v41_low - 256 : v41_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 25 * v_tm_step, v41_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 25 * v_tm_step, (v41 - v41_low) >> 8); + int v42_low = v42 & 255; + v42_low = v42_low >= 128 ? v42_low - 256 : v42_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 26 * v_tm_step, v42_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 26 * v_tm_step, (v42 - v42_low) >> 8); + int v43_low = v43 & 255; + v43_low = v43_low >= 128 ? 
v43_low - 256 : v43_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 27 * v_tm_step, v43_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 27 * v_tm_step, (v43 - v43_low) >> 8); + int v44_low = v44 & 255; + v44_low = v44_low >= 128 ? v44_low - 256 : v44_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 28 * v_tm_step, v44_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 28 * v_tm_step, (v44 - v44_low) >> 8); + int v45_low = v45 & 255; + v45_low = v45_low >= 128 ? v45_low - 256 : v45_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 29 * v_tm_step, v45_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 29 * v_tm_step, (v45 - v45_low) >> 8); + int v50_low = v50 & 255; + v50_low = v50_low >= 128 ? v50_low - 256 : v50_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 30 * v_tm_step, v50_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 30 * v_tm_step, (v50 - v50_low) >> 8); + int v51_low = v51 & 255; + v51_low = v51_low >= 128 ? v51_low - 256 : v51_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 31 * v_tm_step, v51_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 31 * v_tm_step, (v51 - v51_low) >> 8); + int v52_low = v52 & 255; + v52_low = v52_low >= 128 ? v52_low - 256 : v52_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 32 * v_tm_step, v52_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 32 * v_tm_step, (v52 - v52_low) >> 8); + int v53_low = v53 & 255; + v53_low = v53_low >= 128 ? v53_low - 256 : v53_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 33 * v_tm_step, v53_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 33 * v_tm_step, (v53 - v53_low) >> 8); + int v54_low = v54 & 255; + v54_low = v54_low >= 128 ? v54_low - 256 : v54_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 34 * v_tm_step, v54_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 34 * v_tm_step, (v54 - v54_low) >> 8); + int v55_low = v55 & 255; + v55_low = v55_low >= 128 ? 
v55_low - 256 : v55_low; + i8buffer_st1(bottom_tm_low_data, v_tm_offset + 35 * v_tm_step, v55_low); + i8buffer_st1(bottom_tm_high_data, v_tm_offset + 35 * v_tm_step, (v55 - v55_low) >> 8); +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp new file mode 100644 index 000000000000..184b4da67d67 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp @@ -0,0 +1,303 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int use_int8_requantize = 0; +layout(binding = 0) readonly buffer top_tm_blob { int top_tm_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 2) readonly buffer bias_blob { float bias_data[]; }; +layout(binding = 3) readonly buffer weight_descales_blob { float weight_descales_data[]; }; +layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 5) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; + int outc; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= p.outc) + return; + + // load 36 + int v_tm_offset = gz * p.cstep + gy * p.block_x + gx; + int v_tm_step = p.cstep * p.outc; + + int v00 = 
top_tm_blob_data[v_tm_offset + 0 * v_tm_step]; + int v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step]; + int v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step]; + int v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step]; + int v04 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step]; + int v05 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step]; + int v10 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step]; + int v11 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step]; + int v12 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step]; + int v13 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step]; + int v14 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step]; + int v15 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step]; + int v20 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step]; + int v21 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step]; + int v22 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step]; + int v23 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step]; + int v24 = top_tm_blob_data[v_tm_offset + 16 * v_tm_step]; + int v25 = top_tm_blob_data[v_tm_offset + 17 * v_tm_step]; + int v30 = top_tm_blob_data[v_tm_offset + 18 * v_tm_step]; + int v31 = top_tm_blob_data[v_tm_offset + 19 * v_tm_step]; + int v32 = top_tm_blob_data[v_tm_offset + 20 * v_tm_step]; + int v33 = top_tm_blob_data[v_tm_offset + 21 * v_tm_step]; + int v34 = top_tm_blob_data[v_tm_offset + 22 * v_tm_step]; + int v35 = top_tm_blob_data[v_tm_offset + 23 * v_tm_step]; + int v40 = top_tm_blob_data[v_tm_offset + 24 * v_tm_step]; + int v41 = top_tm_blob_data[v_tm_offset + 25 * v_tm_step]; + int v42 = top_tm_blob_data[v_tm_offset + 26 * v_tm_step]; + int v43 = top_tm_blob_data[v_tm_offset + 27 * v_tm_step]; + int v44 = top_tm_blob_data[v_tm_offset + 28 * v_tm_step]; + int v45 = top_tm_blob_data[v_tm_offset + 29 * v_tm_step]; + int v50 = top_tm_blob_data[v_tm_offset + 30 * v_tm_step]; + int v51 = top_tm_blob_data[v_tm_offset + 31 * v_tm_step]; + int v52 = top_tm_blob_data[v_tm_offset + 32 * v_tm_step]; + int v53 = top_tm_blob_data[v_tm_offset 
+ 33 * v_tm_step]; + int v54 = top_tm_blob_data[v_tm_offset + 34 * v_tm_step]; + int v55 = top_tm_blob_data[v_tm_offset + 35 * v_tm_step]; + + // implicit transpose + int m00 = v00 + v01 + v02 + v03 + v04; + int m01 = v10 + v11 + v12 + v13 + v14; + int m02 = v20 + v21 + v22 + v23 + v24; + int m03 = v30 + v31 + v32 + v33 + v34; + int m04 = v40 + v41 + v42 + v43 + v44; + int m05 = (v50 + v51 + v52 + v53 + v54) * 4; + + int m10 = (v01 - v02) + (v03 - v04) * 2; + int m11 = (v11 - v12) + (v13 - v14) * 2; + int m12 = (v21 - v22) + (v23 - v24) * 2; + int m13 = (v31 - v32) + (v33 - v34) * 2; + int m14 = (v41 - v42) + (v43 - v44) * 2; + int m15 = ((v51 - v52) + (v53 - v54) * 2) * 4; + + int m20 = (v01 + v02) + (v03 + v04) * 4; + int m21 = (v11 + v12) + (v13 + v14) * 4; + int m22 = (v21 + v22) + (v23 + v24) * 4; + int m23 = (v31 + v32) + (v33 + v34) * 4; + int m24 = (v41 + v42) + (v43 + v44) * 4; + int m25 = ((v51 + v52) + (v53 + v54) * 4) * 4; + + int m30 = (v01 - v02) + (v03 - v04) * 8 + v05 * 4; + int m31 = (v11 - v12) + (v13 - v14) * 8 + v15 * 4; + int m32 = (v21 - v22) + (v23 - v24) * 8 + v25 * 4; + int m33 = (v31 - v32) + (v33 - v34) * 8 + v35 * 4; + int m34 = (v41 - v42) + (v43 - v44) * 8 + v45 * 4; + int m35 = ((v51 - v52) + (v53 - v54) * 8 + v55 * 4) * 4; + + v00 = m00 + m01 + m02 + m03 + m04; + v10 = m10 + m11 + m12 + m13 + m14; + v20 = m20 + m21 + m22 + m23 + m24; + v30 = m30 + m31 + m32 + m33 + m34; + + v01 = (m01 - m02) + (m03 - m04) * 2; + v11 = (m11 - m12) + (m13 - m14) * 2; + v21 = (m21 - m22) + (m23 - m24) * 2; + v31 = (m31 - m32) + (m33 - m34) * 2; + + v02 = (m01 + m02) + (m03 + m04) * 4; + v12 = (m11 + m12) + (m13 + m14) * 4; + v22 = (m21 + m22) + (m23 + m24) * 4; + v32 = (m31 + m32) + (m33 + m34) * 4; + + v03 = (m01 - m02) + (m03 - m04) * 8 + m05; + v13 = (m11 - m12) + (m13 - m14) * 8 + m15; + v23 = (m21 - m22) + (m23 - m24) * 8 + m25; + v33 = (m31 - m32) + (m33 - m34) * 8 + m35; + + const float descale = weight_descales_data[gz] * (1.f / 576.f); + + 
float out00 = float(v00) * descale; + float out01 = float(v01) * descale; + float out02 = float(v02) * descale; + float out03 = float(v03) * descale; + float out10 = float(v10) * descale; + float out11 = float(v11) * descale; + float out12 = float(v12) * descale; + float out13 = float(v13) * descale; + float out20 = float(v20) * descale; + float out21 = float(v21) * descale; + float out22 = float(v22) * descale; + float out23 = float(v23) * descale; + float out30 = float(v30) * descale; + float out31 = float(v31) * descale; + float out32 = float(v32) * descale; + float out33 = float(v33) * descale; + + if (bias_term == 1) + { + const float bias_value = bias_data[gz]; + + out00 += bias_value; + out01 += bias_value; + out02 += bias_value; + out03 += bias_value; + out10 += bias_value; + out11 += bias_value; + out12 += bias_value; + out13 += bias_value; + out20 += bias_value; + out21 += bias_value; + out22 += bias_value; + out23 += bias_value; + out30 += bias_value; + out31 += bias_value; + out32 += bias_value; + out33 += bias_value; + } + + out00 = float(activation_afp(afp(out00), activation_type, activation_param_0, activation_param_1)); + out01 = float(activation_afp(afp(out01), activation_type, activation_param_0, activation_param_1)); + out02 = float(activation_afp(afp(out02), activation_type, activation_param_0, activation_param_1)); + out03 = float(activation_afp(afp(out03), activation_type, activation_param_0, activation_param_1)); + out10 = float(activation_afp(afp(out10), activation_type, activation_param_0, activation_param_1)); + out11 = float(activation_afp(afp(out11), activation_type, activation_param_0, activation_param_1)); + out12 = float(activation_afp(afp(out12), activation_type, activation_param_0, activation_param_1)); + out13 = float(activation_afp(afp(out13), activation_type, activation_param_0, activation_param_1)); + out20 = float(activation_afp(afp(out20), activation_type, activation_param_0, activation_param_1)); + out21 = 
float(activation_afp(afp(out21), activation_type, activation_param_0, activation_param_1)); + out22 = float(activation_afp(afp(out22), activation_type, activation_param_0, activation_param_1)); + out23 = float(activation_afp(afp(out23), activation_type, activation_param_0, activation_param_1)); + out30 = float(activation_afp(afp(out30), activation_type, activation_param_0, activation_param_1)); + out31 = float(activation_afp(afp(out31), activation_type, activation_param_0, activation_param_1)); + out32 = float(activation_afp(afp(out32), activation_type, activation_param_0, activation_param_1)); + out33 = float(activation_afp(afp(out33), activation_type, activation_param_0, activation_param_1)); + + // store 4x4 + int x = gx * 4; + int y = gy * 4; + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + + out00 *= top_scale; + out01 *= top_scale; + out02 *= top_scale; + out03 *= top_scale; + out10 *= top_scale; + out11 *= top_scale; + out12 *= top_scale; + out13 *= top_scale; + out20 *= top_scale; + out21 *= top_scale; + out22 *= top_scale; + out23 *= top_scale; + out30 *= top_scale; + out31 *= top_scale; + out32 *= top_scale; + out33 *= top_scale; + + int out00_int8 = float2int8(out00); + int out01_int8 = float2int8(out01); + int out02_int8 = float2int8(out02); + int out03_int8 = float2int8(out03); + int out10_int8 = float2int8(out10); + int out11_int8 = float2int8(out11); + int out12_int8 = float2int8(out12); + int out13_int8 = float2int8(out13); + int out20_int8 = float2int8(out20); + int out21_int8 = float2int8(out21); + int out22_int8 = float2int8(out22); + int out23_int8 = float2int8(out23); + int out30_int8 = float2int8(out30); + int out31_int8 = float2int8(out31); + int out32_int8 = float2int8(out32); + int out33_int8 = float2int8(out33); + + int v_offset0 = gz * p.outcstep + y * p.outw + x; + int v_offset1 = v_offset0 + p.outw; + int v_offset2 = v_offset1 + p.outw; + int v_offset3 = v_offset2 + p.outw; + + 
i8buffer_st1(top_blob_int8_data, v_offset0 + 0, out00_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 1, out01_int8); + if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 2, out02_int8); + if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 3, out03_int8); + + if (y + 1 < p.outh) + { + i8buffer_st1(top_blob_int8_data, v_offset1 + 0, out10_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 1, out11_int8); + if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 2, out12_int8); + if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 3, out13_int8); + } + + if (y + 2 < p.outh) + { + i8buffer_st1(top_blob_int8_data, v_offset2 + 0, out20_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 1, out21_int8); + if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 2, out22_int8); + if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 3, out23_int8); + } + + if (y + 3 < p.outh) + { + i8buffer_st1(top_blob_int8_data, v_offset3 + 0, out30_int8); + if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 1, out31_int8); + if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 2, out32_int8); + if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 3, out33_int8); + } + } + else + { + int v_offset0 = gz * p.outcstep + y * p.outw + x; + int v_offset1 = v_offset0 + p.outw; + int v_offset2 = v_offset1 + p.outw; + int v_offset3 = v_offset2 + p.outw; + + buffer_st1(top_blob_data, v_offset0 + 0, afp(out00)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset0 + 1, afp(out01)); + if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset0 + 2, afp(out02)); + if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset0 + 3, afp(out03)); + + if (y + 1 < p.outh) + { + buffer_st1(top_blob_data, v_offset1 + 0, afp(out10)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset1 + 1, afp(out11)); + if (x + 2 < p.outw) 
buffer_st1(top_blob_data, v_offset1 + 2, afp(out12)); + if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset1 + 3, afp(out13)); + } + + if (y + 2 < p.outh) + { + buffer_st1(top_blob_data, v_offset2 + 0, afp(out20)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset2 + 1, afp(out21)); + if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset2 + 2, afp(out22)); + if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset2 + 3, afp(out23)); + } + + if (y + 3 < p.outh) + { + buffer_st1(top_blob_data, v_offset3 + 0, afp(out30)); + if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset3 + 1, afp(out31)); + if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset3 + 2, afp(out32)); + if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset3 + 3, afp(out33)); + } + } +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp new file mode 100644 index 000000000000..e083c947a2ae --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp @@ -0,0 +1,389 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_integerDotProduct16BitSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#define LOCAL_MEMORY_UNROLL_INCH 8 +#define LOCAL_MEMORY_UNROLL_INCH4 (LOCAL_MEMORY_UNROLL_INCH / 4) + +layout(constant_id = 0) const int batch = 1; +layout(constant_id = 1) const int c = 0; +layout(constant_id = 2) const int outc = 0; +layout(constant_id = 3) const int elempack = 1; +layout(constant_id = 4) const int out_elempack = 1; + +#define shape_constant_id_offset 5 +layout(constant_id = shape_constant_id_offset + 0) const int cstep = 0; +layout(constant_id = shape_constant_id_offset + 1) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 2) const 
int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; +layout(binding = 1) readonly buffer bottom_tm_blob_4 { sint16vec4 bottom_tm_blob_data_4[]; }; +layout(binding = 2) writeonly buffer top_tm_blob { int top_tm_blob_data[]; }; +layout(binding = 3) writeonly buffer top_tm_blob_4 { ivec4 top_tm_blob_data_4[]; }; +layout(binding = 4) readonly buffer weight_tm_blob { sint16vec4 weight_tm_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + int outw; + int outcstep; +} p; + +#if NCNN_shader_local_memory +shared lint16vec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH4][4]; +shared lint16vec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH4][4]; +#endif + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + const int c4 = (c + 3) / 4; + const int outc4 = (outc + 3) / 4; + const int outc_aligned = outc4 * 4; + +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy * 4 >= outc || gz >= batch) + return; +#endif + + ivec4 sum0 = ivec4(0); + ivec4 sum1 = ivec4(0); + ivec4 sum2 = ivec4(0); + ivec4 sum3 = ivec4(0); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int q4 = 0; + for (; q4 + (LOCAL_MEMORY_UNROLL_INCH4 - 1) < c4; q4 += LOCAL_MEMORY_UNROLL_INCH4) + { + if (ly < 4) + { + const int pos = gx + ly; + if (pos < psc(outw)) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++) + { + if (elempack == 4) + { + tmp_v[lx][z4][ly] = i16buffer_sm4(bottom_tm_blob_data_4, gz * c4 * psc(cstep) + (q4 + z4) * psc(cstep) + pos); + } + else + { + const ivec4 q = min((q4 + z4) * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); + const int v_offset = gz * c * psc(cstep) + pos; + i16buffer_st4(tmp_v[lx][z4], ly, ivec4(i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep)), 
i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep)))); + } + } + } + else + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++) + { + i16buffer_st4(tmp_v[lx][z4], ly, ivec4(0)); + } + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++) + { + if (gy * 4 + lx < outc_aligned) + tmp_k[ly][z4][lx] = i16buffer_sm4(weight_tm_data, gz * outc_aligned * c4 + (gy * 4 + lx) * c4 + q4 + z4); + else + i16buffer_st4(tmp_k[ly][z4], lx, ivec4(0)); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++) + { + const aint16vec4 v0 = lint162aint16vec4(tmp_v[lx][z4][0]); + const aint16vec4 v1 = lint162aint16vec4(tmp_v[lx][z4][1]); + const aint16vec4 v2 = lint162aint16vec4(tmp_v[lx][z4][2]); + const aint16vec4 v3 = lint162aint16vec4(tmp_v[lx][z4][3]); + + const aint16vec4 k0 = lint162aint16vec4(tmp_k[ly][z4][0]); + const aint16vec4 k1 = lint162aint16vec4(tmp_k[ly][z4][1]); + const aint16vec4 k2 = lint162aint16vec4(tmp_k[ly][z4][2]); + const aint16vec4 k3 = lint162aint16vec4(tmp_k[ly][z4][3]); + +#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated + sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); + sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3)); + sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3)); + sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3)); +#else + const ivec4 v0i = ivec4(v0); + const ivec4 v1i = ivec4(v1); + const ivec4 v2i = ivec4(v2); + const ivec4 v3i = ivec4(v3); + const ivec4 k0i = ivec4(k0); + const ivec4 k1i = ivec4(k1); + const ivec4 k2i = ivec4(k2); + const ivec4 k3i = ivec4(k3); + + sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a; + sum0.g += v0i.r * k1i.r 
+ v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a; + sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * k2i.a; + sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a; + + sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a; + sum1.g += v1i.r * k1i.r + v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a; + sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a; + sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a; + + sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a; + sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a; + sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a; + sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a; + + sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a; + sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a; + sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a; + sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a; +#endif + } + + barrier(); + } + + if (q4 < c4) + { + const int remain = c4 - q4; + + if (ly < 4) + { + const int pos = gx + ly; + if (pos < psc(outw)) + { + for (int z4 = 0; z4 < remain; z4++) + { + if (elempack == 4) + { + tmp_v[lx][z4][ly] = i16buffer_sm4(bottom_tm_blob_data_4, gz * c4 * psc(cstep) + (q4 + z4) * psc(cstep) + pos); + } + else + { + const ivec4 q = min((q4 + z4) * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); + const int v_offset = gz * c * psc(cstep) + pos; + i16buffer_st4(tmp_v[lx][z4], ly, ivec4(i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep)))); + } + } + } + else + { + for (int z4 = 0; z4 < remain; z4++) + { + i16buffer_st4(tmp_v[lx][z4], ly, ivec4(0)); + } + } 
+ } + + if (lx < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + if (gy * 4 + lx < outc_aligned) + tmp_k[ly][z4][lx] = i16buffer_sm4(weight_tm_data, gz * outc_aligned * c4 + (gy * 4 + lx) * c4 + q4 + z4); + else + i16buffer_st4(tmp_k[ly][z4], lx, ivec4(0)); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + const aint16vec4 v0 = lint162aint16vec4(tmp_v[lx][z4][0]); + const aint16vec4 v1 = lint162aint16vec4(tmp_v[lx][z4][1]); + const aint16vec4 v2 = lint162aint16vec4(tmp_v[lx][z4][2]); + const aint16vec4 v3 = lint162aint16vec4(tmp_v[lx][z4][3]); + + const aint16vec4 k0 = lint162aint16vec4(tmp_k[ly][z4][0]); + const aint16vec4 k1 = lint162aint16vec4(tmp_k[ly][z4][1]); + const aint16vec4 k2 = lint162aint16vec4(tmp_k[ly][z4][2]); + const aint16vec4 k3 = lint162aint16vec4(tmp_k[ly][z4][3]); + +#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated + sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); + sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3)); + sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3)); + sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3)); +#else + const ivec4 v0i = ivec4(v0); + const ivec4 v1i = ivec4(v1); + const ivec4 v2i = ivec4(v2); + const ivec4 v3i = ivec4(v3); + const ivec4 k0i = ivec4(k0); + const ivec4 k1i = ivec4(k1); + const ivec4 k2i = ivec4(k2); + const ivec4 k3i = ivec4(k3); + + sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a; + sum0.g += v0i.r * k1i.r + v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a; + sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * k2i.a; + sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a; + + sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a; + sum1.g += v1i.r * k1i.r + 
v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a; + sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a; + sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a; + + sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a; + sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a; + sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a; + sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a; + + sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a; + sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a; + sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a; + sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a; +#endif + } + } +#else + for (int q4 = 0; q4 < c4; q4++) + { + aint16vec4 v0; + aint16vec4 v1; + aint16vec4 v2; + aint16vec4 v3; + + if (elempack == 4) + { + const int v_offset = gz * c4 * psc(cstep) + q4 * psc(cstep) + gx; + v0 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 0); + v1 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 1); + v2 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 2); + v3 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 3); + } + else + { + const ivec4 q = min(q4 * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); + const int v_offset = gz * c * psc(cstep) + gx; + v0 = aint16vec4(ivec4( + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 0), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 0), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 0), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 0))); + v1 = aint16vec4(ivec4( + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 1), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 1), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 1), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * 
psc(cstep) + 1))); + v2 = aint16vec4(ivec4( + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 2), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 2), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 2), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 2))); + v3 = aint16vec4(ivec4( + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 3), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 3), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 3), + i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 3))); + } + + const int w_offset = gz * outc_aligned * c4 + gy * 4 * c4 + q4; + const aint16vec4 k0 = i16buffer_ld4(weight_tm_data, w_offset + 0 * c4); + const aint16vec4 k1 = i16buffer_ld4(weight_tm_data, w_offset + 1 * c4); + const aint16vec4 k2 = i16buffer_ld4(weight_tm_data, w_offset + 2 * c4); + const aint16vec4 k3 = i16buffer_ld4(weight_tm_data, w_offset + 3 * c4); + +#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated + sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); + sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3)); + sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3)); + sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3)); +#else + const ivec4 v0i = ivec4(v0); + const ivec4 v1i = ivec4(v1); + const ivec4 v2i = ivec4(v2); + const ivec4 v3i = ivec4(v3); + const ivec4 k0i = ivec4(k0); + const ivec4 k1i = ivec4(k1); + const ivec4 k2i = ivec4(k2); + const ivec4 k3i = ivec4(k3); + + sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a; + sum0.g += v0i.r * k1i.r + v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a; + sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * 
k2i.a; + sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a; + + sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a; + sum1.g += v1i.r * k1i.r + v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a; + sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a; + sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a; + + sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a; + sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a; + sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a; + sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a; + + sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a; + sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a; + sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a; + sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a; +#endif + } +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy * 4 >= outc || gz >= batch) + return; +#endif + + if (out_elempack == 4) + { + int gi = (gz * outc4 + gy) * psc(outcstep) + gx; + + top_tm_blob_data_4[gi + 0] = sum0; + if (gx + 1 < psc(outw)) top_tm_blob_data_4[gi + 1] = sum1; + if (gx + 2 < psc(outw)) top_tm_blob_data_4[gi + 2] = sum2; + if (gx + 3 < psc(outw)) top_tm_blob_data_4[gi + 3] = sum3; + } + else + { + int gi = (gz * outc + gy * 4) * psc(outcstep) + gx; + + top_tm_blob_data[gi + 0 * psc(outcstep) + 0] = sum0.r; + if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 0] = sum0.g; + if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 0] = sum0.b; + if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 0] = sum0.a; + + if (gx + 1 < psc(outw)) + { + top_tm_blob_data[gi + 0 * psc(outcstep) + 1] = sum1.r; + if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 1] = sum1.g; + if (gy * 4 + 2 < outc) top_tm_blob_data[gi 
+ 2 * psc(outcstep) + 1] = sum1.b; + if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 1] = sum1.a; + } + + if (gx + 2 < psc(outw)) + { + top_tm_blob_data[gi + 0 * psc(outcstep) + 2] = sum2.r; + if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 2] = sum2.g; + if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 2] = sum2.b; + if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 2] = sum2.a; + } + + if (gx + 3 < psc(outw)) + { + top_tm_blob_data[gi + 0 * psc(outcstep) + 3] = sum3.r; + if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 3] = sum3.g; + if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 3] = sum3.b; + if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 3] = sum3.a; + } + } +} diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp new file mode 100644 index 000000000000..cbef1cf91451 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp @@ -0,0 +1,731 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_basic : require + +#extension GL_KHR_memory_scope_semantics : require +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#if ncnn_VK_KHR_cooperative_matrix +#extension GL_KHR_cooperative_matrix : require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix : require +#extension GL_NV_integer_cooperative_matrix : require +#endif + +/* Specialization constants. M, N and K are the cooperative-matrix tile dimensions used in main() (A is M x K, B is K x N, accumulator is M x N); UNROLL_SG_* are per-subgroup tile counts and UNROLL_WG_* the subgroup grid inside a workgroup. */layout(constant_id = 0) const uint batch = 1; +layout(constant_id = 1) const uint M = 1; +layout(constant_id = 2) const uint N = 1; +layout(constant_id = 3) const uint K = 1; +layout(constant_id = 4) const uint UNROLL_SG_M = 2; +layout(constant_id = 5) const uint UNROLL_SG_N = 
2; +layout(constant_id = 6) const uint UNROLL_SG_K = 2; +layout(constant_id = 7) const uint UNROLL_WG_M = 2; +layout(constant_id = 8) const uint UNROLL_WG_N = 2; +layout(constant_id = 9) const uint subgroup_size = 32; +layout(constant_id = 10) const uint inch = 1; +layout(constant_id = 11) const uint outch = 1; +layout(constant_id = 12) const uint elempack = 1; +layout(constant_id = 13) const uint out_elempack = 1; +layout(constant_id = 14) const uint wbstep = 0; + +#define shape_constant_id_offset 15 +layout(constant_id = shape_constant_id_offset + 0) const uint size = 0; +layout(constant_id = shape_constant_id_offset + 1) const uint cstep = 0; +layout(constant_id = shape_constant_id_offset + 2) const uint outcstep = 0; + +/* Bindings 0/1 hold the low/high activation planes read when elempack == 1; binding 2 holds interleaved (low, high) words read otherwise; binding 5 holds (low, high) weight words. Results are written as int through binding 3 when out_elempack == 1, else as ivec4 through binding 4. */layout(binding = 0) readonly buffer bottom_tm_low_blob { sint8vec4 bottom_tm_low_data[]; }; +layout(binding = 1) readonly buffer bottom_tm_high_blob { sint8vec4 bottom_tm_high_data[]; }; +layout(binding = 2) readonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; }; +layout(binding = 3) writeonly buffer top_tm_blob { int top_tm_blob_data[]; }; +layout(binding = 4) writeonly buffer top_tm_blob_4 { ivec4 top_tm_blob_data_4[]; }; +layout(binding = 5) readonly buffer weight_tm_blob { ivec2 weight_tm_data[]; }; + +layout(push_constant) uniform parameter +{ + uint size; + uint cstep; + uint outcstep; +} p; + +const uint Md4 = M / 4; +const uint Nd4 = N / 4; +const uint Kd4 = K / 4; + +/* PAD widens each shared-memory row by one word on the KHR path only. NOTE(review): presumably to avoid bank conflicts; confirm. */#if ncnn_VK_KHR_cooperative_matrix +#define PAD 1 +#elif ncnn_VK_NV_cooperative_matrix +#define PAD 0 +#endif + +const uint Kd4p = Kd4 + PAD; +const uint Nd4p = Nd4 + PAD; + +/* Shared staging indexed as used in main(): tmp_v0/tmp_v1 by sgmi, tmp_k0/tmp_k1 by sgni, tmp_o by subgroup id. */shared int tmp_v0[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; +shared int tmp_v1[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; +shared int tmp_k0[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; +shared int tmp_k1[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; +shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; + +void 
main() +{/* Int8 GEMM on cooperative matrices. gz selects one of batch independent problems; gl_WorkGroupID.x is split into a (wgmi, wgni) workgroup tile and gl_SubgroupID into (sgmi, sgni) within it. */ + const uint gz = gl_GlobalInvocationID.z; + if (gz >= batch) + return; + + const uint wgi = gl_WorkGroupID.x; + const uint sgi = gl_SubgroupID; + + const uint wgmm = (psc(size) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M); + const uint wgnn = (outch + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N); + + const uint wgmi = wgi / wgnn; + const uint wgni = wgi % wgnn; + + const uint sgmi = sgi / UNROLL_WG_N; + const uint sgni = sgi % UNROLL_WG_N; + + if (wgmi >= wgmm) + return; + + /* kk counts K-sized steps over inch; kk_padded (rounded up to UNROLL_SG_K) is only used to address the weight buffer. */const uint kk = (inch + K - 1) / K; + const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + const uint si = gl_SubgroupInvocationID; + + const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M; + const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N; + +#if ncnn_VK_KHR_cooperative_matrix + /* NOTE(review): the KHR coopmat declarations in this text carry no template arguments, unlike the icoopmatNV ones; this looks like extraction damage, confirm against the upstream shader. Three 32-bit accumulators per tile: sum, sum8 and sum16 (see the multiply-add sequence). */coopmat sum[UNROLL_SG_N][UNROLL_SG_M]; + coopmat sum8[UNROLL_SG_N][UNROLL_SG_M]; + coopmat sum16[UNROLL_SG_N][UNROLL_SG_M]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M]; + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum8[UNROLL_SG_N][UNROLL_SG_M]; + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum16[UNROLL_SG_N][UNROLL_SG_M]; +#endif + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopmat(0); + sum8[zn][zm] = coopmat(0); + sum16[zn][zm] = coopmat(0); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0); + sum8[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0); + sum16[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0); +#endif + } + } + + /* Prefetching path, taken when there are at least two K groups. */if (kk >= UNROLL_SG_K * 2) + { + // local stack and shared memory ping-pong + + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = 
M_Kd4p_USGM * UNROLL_SG_K; + const uint V_USGM_USGK = elempack == 1 ? Md4_K_USGM_USGK : M_Kd4p_USGM_USGK; + const uint V_USGM_USGK_d_subgroupsize = (V_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + const uint K_Nd4p = K * Nd4p; + const uint K_Nd4p_USGN = K_Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + + /* Per-invocation staging for the next tile; .x is later copied to tmp_v0/tmp_k0 and .y to tmp_v1/tmp_k1. */ivec2 prefetch_tmp_v[(M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)]; + ivec2 prefetch_tmp_k[(K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)]; + + // prefetch the very first + { + const uint ki = 0; + + if (elempack == 1) + { + const uint cstepd4 = psc(cstep) / 4; + + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + /* Slot index: the UNROLL_WG_N subgroups that share sgmi interleave over the tile in subgroup_size-wide strides. */const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + + const uint gi = (gz * inch + gk) * cstepd4 + gm; + prefetch_tmp_v[q] = ivec2(i8buffer_sm4(bottom_tm_low_data, gi), i8buffer_sm4(bottom_tm_high_data, gi)); + } + } + } + else // if (elempack == 4) + { + const uint inchd4 = inch / 4; + const uint cstepd2 = psc(cstep) / 2; + + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zm = zmij / (M * Kd4p); + const uint ij = zmij % (M * 
Kd4p); + const uint i = ij / Kd4p; + /* The padding column is clamped onto the last valid word of the row. */const uint j = min(ij % Kd4p, Kd4 - 1); + + const uint gm = (mi + zm) * M + i; + const uint gk = (ki + zk) * Kd4 + j; + + const uint gi = (gz * inchd4 + gk) * cstepd2 + gm; + prefetch_tmp_v[q] = bottom_tm_low_high_data[gi]; + } + } + } + } + { + /* Weight offset: gz block, then workgroup column wgni, then subgroup column sgni of K group 0. */const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK; + + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = weight_tm_data[w_offset + siq]; + } + } + } + + uint k = 0; + for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K) + { + // copy prefetched tile to shared memory + { + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + tmp_v0[sgmi][siq] = prefetch_tmp_v[q].x; + tmp_v1[sgmi][siq] = prefetch_tmp_v[q].y; + } + } + } + { + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k0[sgni][siq] = prefetch_tmp_k[q].x; + tmp_k1[sgni][siq] = prefetch_tmp_k[q].y; + } + } + } + + /* The tile written above must be complete before any subgroup loads fragments from it. */barrier(); + + // prefetch next tile + const uint ki = k + UNROLL_SG_K; + { + if (elempack == 1) + { + const uint cstepd4 = psc(cstep) / 4; + + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij 
/ Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + + const uint gi = (gz * inch + gk) * cstepd4 + gm; + prefetch_tmp_v[q] = ivec2(i8buffer_sm4(bottom_tm_low_data, gi), i8buffer_sm4(bottom_tm_high_data, gi)); + } + } + } + else // if (elempack == 4) + { + const uint inchd4 = inch / 4; + const uint cstepd2 = psc(cstep) / 2; + + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zm = zmij / (M * Kd4p); + const uint ij = zmij % (M * Kd4p); + const uint i = ij / Kd4p; + const uint j = min(ij % Kd4p, Kd4 - 1); + + const uint gm = (mi + zm) * M + i; + const uint gk = (ki + zk) * Kd4 + j; + + const uint gi = (gz * inchd4 + gk) * cstepd2 + gm; + prefetch_tmp_v[q] = bottom_tm_low_high_data[gi]; + } + } + } + } + { + /* Weight offset for K group ki / UNROLL_SG_K: gz block, then workgroup column wgni, then K group, then subgroup column sgni. */const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; + + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = weight_tm_data[w_offset + siq]; + } + } + } + +/* A0/A1 are loaded from tmp_v0/tmp_v1 and B0/B1 from tmp_k0/tmp_k1 below. */#if ncnn_VK_KHR_cooperative_matrix + coopmat A0[UNROLL_SG_M]; + coopmat A1[UNROLL_SG_M]; + coopmat B0[UNROLL_SG_N]; + coopmat B1[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm 
< UNROLL_SG_M; zm++) + { + /* elempack == 1 tiles are read column-major with stride Md4; otherwise row-major with stride Kd4p. */if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); + coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); + coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false); + coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + /* sum += A0*B0, sum8 += A1*B0 + A0*B1, sum16 += A1*B1. */sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = 
coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]); +#endif + } + } + } + + /* The next iteration overwrites the shared tiles, so every subgroup has to be done reading them first. */barrier(); + } + + // the last copy prefetch to shared memory + { + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + tmp_v0[sgmi][siq] = prefetch_tmp_v[q].x; + tmp_v1[sgmi][siq] = prefetch_tmp_v[q].y; + } + } + } + { + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k0[sgni][siq] = prefetch_tmp_k[q].x; + tmp_k1[sgni][siq] = prefetch_tmp_k[q].y; + } + } + } + + barrier(); + +/* Final K group: same fragment loads and multiply-adds as inside the loop, with no further prefetch. */#if ncnn_VK_KHR_cooperative_matrix + coopmat A0[UNROLL_SG_M]; + coopmat A1[UNROLL_SG_M]; + coopmat B0[UNROLL_SG_N]; + coopmat B1[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], 
tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); + coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); + coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + /* Load the UNROLL_SG_N weight fragments for step zk. */[[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false); + coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]); +#endif + } + } + } + } + else + { + // no ping-pong 
version + for (uint k = 0; k < kk; k += UNROLL_SG_K) + {/* Each iteration loads one K group straight into shared memory, synchronises, multiplies it, and synchronises again. */ + { + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint V_USGM_USGK = elempack == 1 ? Md4_K_USGM_USGK : M_Kd4p_USGM_USGK; + const uint V_USGM_USGK_d_subgroupsize = (V_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) + { + if (elempack == 1) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (k + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + + const uint cstepd4 = psc(cstep) / 4; + const uint gi = (gz * inch + gk) * cstepd4 + gm; + tmp_v0[sgmi][siq] = i8buffer_sm4(bottom_tm_low_data, gi); + tmp_v1[sgmi][siq] = i8buffer_sm4(bottom_tm_high_data, gi); + } + else + { + const uint cstepd2 = psc(cstep) / 2; + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zm = zmij / (M * Kd4p); + const uint ij = zmij % (M * Kd4p); + const uint i = ij / Kd4p; + /* The padding column is clamped onto the last valid word of the row. */const uint j = min(ij % Kd4p, Kd4 - 1); + + const uint gm = (mi + zm) * M + i; + const uint gk = (k + zk) * Kd4 + j; + + const ivec2 v01 = bottom_tm_low_high_data[(gz * (inch / 4) + gk) * cstepd2 + gm]; + tmp_v0[sgmi][siq] = v01.x; + tmp_v1[sgmi][siq] = v01.y; + } + } + } + } + { + const uint K_Nd4p = K * Nd4p; + const uint K_Nd4p_USGN = K_Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((k / UNROLL_SG_K) * UNROLL_WG_N + sgni) 
* K_Nd4p_USGN_USGK; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + const ivec2 k01 = weight_tm_data[w_offset + siq]; + tmp_k0[sgni][siq] = k01.x; + tmp_k1[sgni][siq] = k01.y; + } + } + } + + /* Tiles must be fully written before any subgroup loads fragments from them. */barrier(); + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A0[UNROLL_SG_M]; + coopmat A1[UNROLL_SG_M]; + coopmat B0[UNROLL_SG_N]; + coopmat B1[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); + coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); + 
coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p); + coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false); + coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]); + sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]); + sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]); +#endif + } + } + } + + barrier(); + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + /* Recombine the partial sums: the cross terms weigh 2^8 and the A1*B1 term weighs 2^16, consistent with each operand being low + 256 * high. */sum[zn][zm] += sum8[zn][zm] * (1 << 8) + sum16[zn][zm] * (1 << 16); + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + /* Spill every accumulator tile into the tmp_o area of this subgroup; the layout follows out_elempack. */if (out_elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], 
(zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true); +#endif + } + else + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false); +#endif + } + } + } + + barrier(); + + if (out_elempack == 1) + { + /* Unpacked output: each ivec4 of tmp_o covers positions gm * 4 .. gm * 4 + 3 of output channel gn and is written one int at a time, each guarded against size. */const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N; + const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN) + { + const uint zn = siq / (Md4 * N * UNROLL_SG_M); + const uint zmij = siq % (Md4 * N * UNROLL_SG_M); + const uint zm = zmij / (Md4 * N); + const uint ij = zmij % (Md4 * N); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gn = (ni + zn) * N + i; + const uint gm = (mi + zm) * Md4 + j; + + if (gn < outch && gm * 4 < psc(size)) + { + const ivec4 sumi = tmp_o[sgi][siq]; + const uint gi = (gz * outch + gn) * psc(outcstep) + gm * 4; + + top_tm_blob_data[gi + 0] = sumi.r; + if (gm * 4 + 1 < psc(size)) top_tm_blob_data[gi + 1] = sumi.g; + if (gm * 4 + 2 < psc(size)) top_tm_blob_data[gi + 2] = sumi.b; + if (gm * 4 + 3 < psc(size)) top_tm_blob_data[gi + 3] = sumi.a; + } + } + } + } + else + { + /* Packed output: each ivec4 of tmp_o covers four consecutive output channels starting at gn for position gm and is written as one ivec4. */const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N; + const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN) + { + const uint zn = siq / (M * Nd4 * UNROLL_SG_M); + const uint zmij = siq % (M * Nd4 * UNROLL_SG_M); + const uint zm = zmij / (M * Nd4); + 
const uint ij = zmij % (M * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gm = (mi + zm) * M + i; + const uint gn = (ni + zn) * N + j * 4; + + /* Drop tile elements that lie outside the real output extent. */if (gm < psc(size) && gn < outch) + { + const ivec4 sumi = tmp_o[sgi][siq]; + top_tm_blob_data_4[(gz * (outch / 4) + gn / 4) * psc(outcstep) + gm] = sumi; + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp b/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp new file mode 100644 index 000000000000..f52d1fdb784f --- /dev/null +++ b/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp @@ -0,0 +1,890 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_basic : require + +#extension GL_KHR_memory_scope_semantics : require +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#if ncnn_VK_KHR_cooperative_matrix +#extension GL_KHR_cooperative_matrix : require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix : require +#extension GL_NV_integer_cooperative_matrix : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const uint kernel_w = 1; +layout(constant_id = 1) const uint kernel_h = 1; +layout(constant_id = 2) const uint dilation_w = 1; +layout(constant_id = 3) const uint dilation_h = 1; +layout(constant_id = 4) const uint stride_w = 1; +layout(constant_id = 5) const uint stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) const int activation_type = 0; +layout(constant_id = 8) const float activation_param_0 = 0; +layout(constant_id = 9) const float activation_param_1 = 0; +layout(constant_id = 10) const int use_int8_requantize = 0; +layout(constant_id = 11) const uint elempack = 1; +layout(constant_id = 12) const uint 
out_elempack = 1; + +#define shape_constant_id_offset 13 +layout(constant_id = shape_constant_id_offset + 0) const uint w = 0; +layout(constant_id = shape_constant_id_offset + 1) const uint h = 0; +layout(constant_id = shape_constant_id_offset + 2) const uint cstep = 0; +layout(constant_id = shape_constant_id_offset + 3) const uint outw = 0; +layout(constant_id = shape_constant_id_offset + 4) const uint outh = 0; +layout(constant_id = shape_constant_id_offset + 5) const uint outcstep = 0; +layout(constant_id = shape_constant_id_offset + 6) const uint num_output = 0; +layout(constant_id = shape_constant_id_offset + 7) const uint num_input = 0; + +layout(constant_id = shape_constant_id_offset + 8 + 0) const uint M = 1; +layout(constant_id = shape_constant_id_offset + 8 + 1) const uint N = 1; +layout(constant_id = shape_constant_id_offset + 8 + 2) const uint K = 1; +layout(constant_id = shape_constant_id_offset + 8 + 3) const uint subgroup_size = 32; +layout(constant_id = shape_constant_id_offset + 8 + 4) const uint UNROLL_SG_M = 2; +layout(constant_id = shape_constant_id_offset + 8 + 5) const uint UNROLL_SG_N = 2; +layout(constant_id = shape_constant_id_offset + 8 + 6) const uint UNROLL_SG_K = 2; +layout(constant_id = shape_constant_id_offset + 8 + 7) const uint UNROLL_WG_M = 2; +layout(constant_id = shape_constant_id_offset + 8 + 8) const uint UNROLL_WG_N = 2; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; +layout(binding = 3) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 4) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 5) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 6) readonly buffer top_scales_blob { sfp top_scales_data[]; }; + +layout(push_constant) uniform 
parameter +{ + uint w; + uint h; + uint cstep; + uint outw; + uint outh; + uint outcstep; + uint num_output; + uint num_input; +} p; + +const uint Nd4 = N / 4; +const uint Kd4 = K / 4; +const uint Md4 = M / 4; + +#if ncnn_VK_KHR_cooperative_matrix +#define PAD 1 +#elif ncnn_VK_NV_cooperative_matrix +#define PAD 0 +#endif + +const uint Nd4p = Nd4 + PAD; +const uint Kd4p = Kd4 + PAD; + +shared int tmp_v[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; +shared int tmp_k[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; +shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; + +void main() +{ + const uint maxk = kernel_w * kernel_h; + const uint size = psc(outw) * psc(outh); + const uint K0 = psc(num_input) * maxk; + const uint K04 = psc(num_input) / 4 * maxk; + + const uint wgi = gl_WorkGroupID.x; + const uint sgi = gl_SubgroupID; + + const uint wgmm = (size + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M); + const uint wgnn = (psc(num_output) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N); + + const uint wgmi = wgi / wgnn; + const uint wgni = wgi % wgnn; + + const uint sgmi = sgi / UNROLL_WG_N; + const uint sgni = sgi % UNROLL_WG_N; + + const uint kk = (K0 + K - 1) / K; + const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; + + if (wgmi >= wgmm) + return; + + const uint si = gl_SubgroupInvocationID; + + const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M; + const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N; + +#if ncnn_VK_KHR_cooperative_matrix + coopmat sum[UNROLL_SG_N][UNROLL_SG_M]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M]; +#endif + + { + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopmat(0); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = icoopmatNV<32, 
gl_ScopeSubgroup, M, N>(0); +#endif + } + } + } + + uint k = 0; + + if (kk >= UNROLL_SG_K * 2) + { + // local stack and shared memory ping-pong + + // prefetch + int prefetch_tmp_v[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4p + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)]; + int prefetch_tmp_k[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4p + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)]; + + // prefetch the very first + { + if (elempack == 1) + { + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = zk * K + i; + const uint gm = (mi + zm) * Md4 + j; + const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3); + + ivec4 v4 = ivec4(0); + if (gk < K0) + { + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + + const uvec4 sy = gm4 / psc(outw); + const uvec4 sx = gm4 - sy * psc(outw); + const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + if (gm4.r < size) + { + const uint gi = sz * psc(cstep) + spatial.r; + v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.g < size) + { + const uint gi = sz * psc(cstep) + spatial.g; + v4.g = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.b < size) + { + const uint gi = sz * psc(cstep) + spatial.b; + v4.b = 
i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.a < size) + { + const uint gi = sz * psc(cstep) + spatial.a; + v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + } + + prefetch_tmp_v[q] = packInt4x8(v4); + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zmi = zmij / Kd4p; + const uint j = zmij % Kd4p; + + const uint gm = mi * M + zmi; + const uint gk = zk * Kd4 + j; + + int v = 0; + if (gm < size && gk < K04) + { + const uint sx = gm % psc(outw); + const uint sy = gm / psc(outw); + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial); + } + + prefetch_tmp_v[q] = v; + } + } + } + } + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < 
K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq); + } + } + } + + for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K) + { + // copy prefetched tile to shared memory + { + if (elempack == 1) + { + const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + } + { + const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = prefetch_tmp_k[q]; + } + } + } + + barrier(); + + // prefetch next tile + const uint ki = k + UNROLL_SG_K; + { + if (elempack == 1) + { + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = 
(Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3); + + ivec4 v4 = ivec4(0); + if (gk < K0) + { + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + + const uvec4 sy = gm4 / psc(outw); + const uvec4 sx = gm4 - sy * psc(outw); + const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + if (gm4.r < size) + { + const uint gi = sz * psc(cstep) + spatial.r; + v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.g < size) + { + const uint gi = sz * psc(cstep) + spatial.g; + v4.g = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.b < size) + { + const uint gi = sz * psc(cstep) + spatial.b; + v4.b = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.a < size) + { + const uint gi = sz * psc(cstep) + spatial.a; + v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + } + + prefetch_tmp_v[q] = packInt4x8(v4); + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) 
* subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zmi = zmij / Kd4p; + const uint j = zmij % Kd4p; + + const uint gm = mi * M + zmi; + const uint gk = (ki + zk) * Kd4 + j; + + int v = 0; + if (gm < size && gk < K04) + { + const uint sx = gm % psc(outw); + const uint sy = gm / psc(outw); + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial); + } + + prefetch_tmp_v[q] = v; + } + } + } + } + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; + + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq); + } + } + } + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, 
gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + + // copy and compute the last prefetched tile + { + if (elempack == 1) + { + const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + 
[[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + tmp_v[sgmi][siq] = prefetch_tmp_v[q]; + } + } + } + } + { + const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = prefetch_tmp_k[q]; + } + } + } + + barrier(); + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, 
gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + else + { + for (uint ki = 0; ki < kk; ki += UNROLL_SG_K) + { + { + if (elempack == 1) + { + const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; + const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K; + const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK) + { + const uint zk = siq / Md4_K_USGM; + const uint zmij = siq % Md4_K_USGM; + const uint zm = zmij / (Md4 * K); + const uint ij = zmij % (Md4 * K); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gk = (ki + zk) * K + i; + const uint gm = (mi + zm) * Md4 + j; + const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3); + + ivec4 v4 = ivec4(0); + if (gk < K0) + { + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + + const uvec4 sy = gm4 / psc(outw); + const uvec4 sx = gm4 - sy * psc(outw); + const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + if (gm4.r < size) + { + const uint gi = sz * psc(cstep) + spatial.r; + v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.g < size) + { + const uint gi = sz * psc(cstep) + spatial.g; + v4.g = 
i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.b < size) + { + const uint gi = sz * psc(cstep) + spatial.b; + v4.b = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + if (gm4.a < size) + { + const uint gi = sz * psc(cstep) + spatial.a; + v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4]; + } + } + + tmp_v[sgmi][siq] = packInt4x8(v4); + } + } + } + else // elempack == 4 + { + const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; + const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K; + const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK) + { + const uint zk = siq / M_Kd4p_USGM; + const uint zmij = siq % M_Kd4p_USGM; + const uint zmi = zmij / Kd4p; + const uint j = zmij % Kd4p; + + const uint gm = mi * M + zmi; + const uint gk = (ki + zk) * Kd4 + j; + + int v = 0; + if (gm < size && gk < K04) + { + const uint sx = gm % psc(outw); + const uint sy = gm / psc(outw); + const uint sz = gk / maxk; + const uint k = gk - sz * maxk; + const uint ky = k / kernel_w; + const uint kx = k - ky * kernel_w; + const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + + v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial); + } + + tmp_v[sgmi][siq] = v; + } + } + } + } + + { + const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N; + const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; + const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; + [[unroll]] for (uint q = 0; q < 
K_Nd4p_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK) + { + tmp_k[sgni][siq] = i8buffer_sm4(weight_data, w_offset + siq); + } + } + } + + barrier(); + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true); +#endif + } + else // elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + } + + 
[[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (out_elempack == 1) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true); +#endif + } + else // out_elempack == 4 + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false); +#endif + } + } + } + + barrier(); + + if (out_elempack == 1) + { + const uint outcstepd4 = psc(outcstep) / 4; + const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N; + const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN) + { + const uint zn = siq / (Md4 * N * UNROLL_SG_M); + const uint zmij = siq % (Md4 * N * UNROLL_SG_M); + const uint zm = zmij / (Md4 * N); + const uint ij = zmij % (Md4 * N); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gn = (ni + zn) * N + i; + const uint gm = (mi + zm) * Md4 + j; + + if (gn < psc(num_output) && gm * 4 < size) + { + const ivec4 sumi = tmp_o[sgi][siq]; + const int gn4 = int(gn % 4); + vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4][gn4]; + + if (bias_term == 1) + { + sumfp += bias_data[gn / 4][gn4]; + } + + sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = float(buffer_ld1(top_scales_data, 0)); 
+ sumfp *= top_scale; + i8buffer_st4(top_blob_int8_data, gn * outcstepd4 + gm, float2int8vec4(sumfp)); + } + else + { + buffer_st4(top_blob_data, gn * outcstepd4 + gm, afpvec4(sumfp)); + } + } + } + } + } + else // out_elempack == 4 + { + const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N; + const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN) + { + const uint zn = siq / (M * Nd4 * UNROLL_SG_M); + const uint zmij = siq % (M * Nd4 * UNROLL_SG_M); + const uint zm = zmij / (M * Nd4); + const uint ij = zmij % (M * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gm = (mi + zm) * M + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gm < size && gn < psc(num_output)) + { + const ivec4 sumi = tmp_o[sgi][siq]; + vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4]; + + if (bias_term == 1) + { + sumfp += bias_data[gn / 4]; + } + + sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = float(buffer_ld1(top_scales_data, 0)); + sumfp *= top_scale; + i8buffer_st4(top_blob_int8_data, (gn / 4) * psc(outcstep) + gm, float2int8vec4(sumfp)); + } + else + { + buffer_st4(top_blob_data, (gn / 4) * psc(outcstep) + gm, afpvec4(sumfp)); + } + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp new file mode 100644 index 000000000000..6ac6d0563a18 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp @@ -0,0 +1,133 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 
450
+
+layout(constant_id = 0) const int c = 0; // same name as the push-constant field p.c
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // ids 1..6 share their names with the other push-constant fields
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // source, read via i8buffer_ld4
+layout(binding = 1) writeonly buffer bottom_tm_blob { sint16vec4 bottom_tm_blob_data[]; }; // destination, written via i16buffer_st4
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p; // NOTE(review): read only through psc(), which is defined outside this file; presumably psc(x) picks the specialization constant x or falls back to p.x - confirm
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // plane index, multiplied by cstep / outcstep below
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) // invocations outside the tile grid or past the last plane do nothing
+        return;
+
+    // load the 4x4 input patch whose top-left corner is (sx, sy); tiles start 2 apart, so neighbouring patches overlap by 2
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element offset of the patch's top-left corner
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // offsets of the four patch rows (.r/.g/.b/.a = rows 0..3)
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); // the only unguarded read; every other tap is zero when its column >= w or its row >= h
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0);
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx; // linear tile index, row-major over block_x
+
+    ivec4 m00 = v00 - v02; // pass 1: for patch row r, m0r = c0 - c2 of that row's four taps c0..c3
+    ivec4 m01 = v10 - v12;
+    ivec4 m02 = v20 - v22;
+    ivec4 m03 = v30 - v32;
+
+    ivec4 m10 = v01 + v02; // m1r = c1 + c2
+    ivec4 m11 = v11 + v12;
+    ivec4 m12 = v21 + v22;
+    ivec4 m13 = v31 + v32;
+
+    ivec4 m20 = v02 - v01; // m2r = c2 - c1
+    ivec4 m21 = v12 - v11;
+    ivec4 m22 = v22 - v21;
+    ivec4 m23 = v32 - v31;
+
+    ivec4 m30 = v03 - v01; // m3r = c3 - c1
+    ivec4 m31 = v13 - v11;
+    ivec4 m32 = v23 - v21;
+    ivec4 m33 = v33 - v31;
+
+    v00 = m00 - m02; // pass 2: the same four combinations applied along the second index of m; results overwrite v
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m01 + m02;
+    v11 = m11 + m12;
+    v21 = m21 + m22;
+    v31 = m31 + m32;
+
+    v02 = m02 - m01;
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01;
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16 transformed values; value k is written v_tm_step elements after value k - 1
+    int v_tm_offset = gz * psc(outcstep) + tile; // start of this gz plane plus the tile index
+    int v_tm_step = psc(outcstep) * psc(c); // distance between consecutive transformed values of the same tile
+
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01);
+    i16buffer_st4(bottom_tm_blob_data,
v_tm_offset + 2 * v_tm_step, v02);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v10);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v11);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v12);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v13);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v20);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v21);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v22);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v23);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v30);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v31);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v32);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v33);
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp
new file mode 100644
index 000000000000..b756d7862912
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp
@@ -0,0 +1,152 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0;
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; };
+layout(binding = 1) writeonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p;
+
+// Split every lane of v into a signed low byte and the remaining high part so that
+// v == high * 256 + low with low in [-128, 127], then pack both halves.
+// Returns ivec2(packInt4x8(low), packInt4x8(high)).
+// NOTE(review): packInt4x8 is defined outside this file; presumably it packs the low
+// 8 bits of each of the four lanes into one int - confirm against the shader prelude.
+ivec2 split_low_high(ivec4 v)
+{
+    ivec4 low = v & ivec4(255);
+    // map the unsigned byte [0, 255] onto the signed range [-128, 127]
+    low = mix(low, low - ivec4(256), greaterThanEqual(low, ivec4(128)));
+    // v - low is a multiple of 256 and >> on a signed int sign-extends, so the shift divides exactly
+    ivec4 high = (v - low) >> 8;
+    return ivec2(packInt4x8(low), packInt4x8(high));
+}
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // plane index, multiplied by cstep / outcstepd2 below
+
+    // NOTE(review): psc() is defined outside this file; presumably psc(x) picks the specialization constant x or falls back to p.x - confirm
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
+        return;
+
+    // load the 4x4 input patch whose top-left corner is (sx, sy); taps with column >= w or row >= h read as zero
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx;
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // offsets of the four patch rows
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0);
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0);
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx;
+
+    // pass 1: for each patch row r with taps c0..c3, m0r = c0 - c2, m1r = c1 + c2, m2r = c2 - c1, m3r = c3 - c1
+    ivec4 m00 = v00 - v02;
+    ivec4 m01 = v10 - v12;
+    ivec4 m02 = v20 - v22;
+    ivec4 m03 = v30 - v32;
+
+    ivec4 m10 = v01 + v02;
+    ivec4 m11 = v11 + v12;
+    ivec4 m12 = v21 + v22;
+    ivec4 m13 = v31 + v32;
+
+    ivec4 m20 = v02 - v01;
+    ivec4 m21 = v12 - v11;
+    ivec4 m22 = v22 - v21;
+    ivec4 m23 = v32 - v31;
+
+    ivec4 m30 = v03 - v01;
+    ivec4 m31 = v13 - v11;
+    ivec4 m32 = v23 - v21;
+    ivec4 m33 = v33 - v31;
+
+    // pass 2: the same four combinations applied along the second index of m; results overwrite v
+    v00 = m00 - m02;
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m01 + m02;
+    v11 = m11 + m12;
+    v21 = m21 + m22;
+    v31 = m31 + m32;
+
+    v02 = m02 - m01;
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01;
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16 transformed values as packed (low, high) pairs; value k is written v_tm_step elements after value k - 1
+    const int outcstepd2 = psc(outcstep) / 2; // NOTE(review): halved, presumably because one ivec2 element spans two outcstep units - confirm with the caller
+    int v_tm_offset = gz * outcstepd2 + tile;
+    int v_tm_step = outcstepd2 * psc(c);
+
+    bottom_tm_low_high_data[v_tm_offset + 0 * v_tm_step] = split_low_high(v00);
+    bottom_tm_low_high_data[v_tm_offset + 1 * v_tm_step] = split_low_high(v01);
+    bottom_tm_low_high_data[v_tm_offset + 2 * v_tm_step] = split_low_high(v02);
+    bottom_tm_low_high_data[v_tm_offset + 3 * v_tm_step] = split_low_high(v03);
+    bottom_tm_low_high_data[v_tm_offset + 4 * v_tm_step] = split_low_high(v10);
+    bottom_tm_low_high_data[v_tm_offset + 5 * v_tm_step] = split_low_high(v11);
+    bottom_tm_low_high_data[v_tm_offset + 6 * v_tm_step] = split_low_high(v12);
+    bottom_tm_low_high_data[v_tm_offset + 7 * v_tm_step] = split_low_high(v13);
+    bottom_tm_low_high_data[v_tm_offset + 8 * v_tm_step] = split_low_high(v20);
+    bottom_tm_low_high_data[v_tm_offset + 9 * v_tm_step] = split_low_high(v21);
+    bottom_tm_low_high_data[v_tm_offset + 10 * v_tm_step] = split_low_high(v22);
+    bottom_tm_low_high_data[v_tm_offset + 11 * v_tm_step] = split_low_high(v23);
+    bottom_tm_low_high_data[v_tm_offset + 12 * v_tm_step] = split_low_high(v30);
+    bottom_tm_low_high_data[v_tm_offset + 13 * v_tm_step] = split_low_high(v31);
+    bottom_tm_low_high_data[v_tm_offset + 14 * v_tm_step] = split_low_high(v32);
+    bottom_tm_low_high_data[v_tm_offset + 15 * v_tm_step] = split_low_high(v33);
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp
new file mode 100644
index 000000000000..3170fb28473c
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp
@@ -0,0 +1,142 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0;
+layout(constant_id = 1) const int activation_type = 0;
+layout(constant_id = 2) const float activation_param_0 = 0;
+layout(constant_id = 3) const float activation_param_1 = 0;
+layout(constant_id = 4) const int use_int8_requantize = 0;
+layout(binding = 0) readonly buffer top_tm_blob { ivec4 top_tm_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+layout(binding = 2) readonly buffer bias_blob { vec4 bias_data[]; };
+layout(binding = 3) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; };
+layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; };
+layout(binding = 5) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int cstep;
+
+    int block_x;
+    int block_y;
+
+    int outw;
+    int outh;
+    int outcstep;
+    int outc;
+} p;
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    const int outc4 = (p.outc + 3) / 4;
+
+    if (gx >= p.block_x || gy >= p.block_y || gz >= outc4)
+        return;
+
+    // load 16
+    int v_tm_offset = gz * p.cstep + gy * p.block_x + gx;
+    int v_tm_step = p.cstep * outc4;
+
+    ivec4 v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step];
+    ivec4 v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step];
+    ivec4 v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step];
+    ivec4 v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step];
+    ivec4 v10 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step];
+    ivec4 v11 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step];
+    ivec4 v12 = top_tm_blob_data[v_tm_offset + 6 *
v_tm_step]; + ivec4 v13 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step]; + ivec4 v20 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step]; + ivec4 v21 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step]; + ivec4 v22 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step]; + ivec4 v23 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step]; + ivec4 v30 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step]; + ivec4 v31 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step]; + ivec4 v32 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step]; + ivec4 v33 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step]; + + ivec4 m00 = v00 + v01 + v02; + ivec4 m01 = v10 + v11 + v12; + ivec4 m02 = v20 + v21 + v22; + ivec4 m03 = v30 + v31 + v32; + + ivec4 m10 = v01 - v02 + v03; + ivec4 m11 = v11 - v12 + v13; + ivec4 m12 = v21 - v22 + v23; + ivec4 m13 = v31 - v32 + v33; + + const vec4 descale = vec4(0.25f) * weight_descales_data[gz]; + + vec4 out00 = vec4(m00 + m01 + m02) * descale; + vec4 out01 = vec4(m01 - m02 + m03) * descale; + vec4 out10 = vec4(m10 + m11 + m12) * descale; + vec4 out11 = vec4(m11 - m12 + m13) * descale; + + if (bias_term == 1) + { + const vec4 bias_value = bias_data[gz]; + + out00 += bias_value; + out01 += bias_value; + out10 += bias_value; + out11 += bias_value; + } + + out00 = vec4(activation_afpvec4(afpvec4(out00), activation_type, activation_param_0, activation_param_1)); + out01 = vec4(activation_afpvec4(afpvec4(out01), activation_type, activation_param_0, activation_param_1)); + out10 = vec4(activation_afpvec4(afpvec4(out10), activation_type, activation_param_0, activation_param_1)); + out11 = vec4(activation_afpvec4(afpvec4(out11), activation_type, activation_param_0, activation_param_1)); + + // store 2x2 + int x = gx * 2; + int y = gy * 2; + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + + out00 *= top_scale; + out01 *= top_scale; + out10 *= top_scale; + out11 *= top_scale; + + ivec4 out00_int8 = float2int8vec4(out00); + ivec4 out01_int8 = 
float2int8vec4(out01); + ivec4 out10_int8 = float2int8vec4(out10); + ivec4 out11_int8 = float2int8vec4(out11); + + int v_offset_0 = gz * p.outcstep + y * p.outw + x; + int v_offset_1 = v_offset_0 + p.outw; + + i8buffer_st4(top_blob_int8_data, v_offset_0 + 0, out00_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset_0 + 1, out01_int8); + + if (y + 1 < p.outh) + { + i8buffer_st4(top_blob_int8_data, v_offset_1 + 0, out10_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset_1 + 1, out11_int8); + } + } + else + { + int v_offset_0 = gz * p.outcstep + y * p.outw + x; + int v_offset_1 = v_offset_0 + p.outw; + + buffer_st4(top_blob_data, v_offset_0 + 0, afpvec4(out00)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset_0 + 1, afpvec4(out01)); + + if (y + 1 < p.outh) + { + buffer_st4(top_blob_data, v_offset_1 + 0, afpvec4(out10)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset_1 + 1, afpvec4(out11)); + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp new file mode 100644 index 000000000000..191427040895 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp @@ -0,0 +1,220 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding = 0) readonly buffer bottom_blob { 
sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_blob { sint16vec4 bottom_tm_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + + int c; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) + return; + + // load 6x6 + int sx = gx * 4; + int sy = gy * 4; + + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); + + ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); + ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0); + ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0); + ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0); + ivec4 v04 = sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 4) : ivec4(0); + ivec4 v05 = sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 5) : ivec4(0); + + ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0); + ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0); + ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0); + ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0); + ivec4 v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 4) : ivec4(0); + ivec4 v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 5) : ivec4(0); + + ivec4 v20 = sy + 2 < psc(h) ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0); + ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0); + ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0); + ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0); + ivec4 v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 4) : ivec4(0); + ivec4 v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 5) : ivec4(0); + + ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0); + ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0); + ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0); + ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0); + ivec4 v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 4) : ivec4(0); + ivec4 v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 5) : ivec4(0); + + ivec4 v40 = sy + 4 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 0) : ivec4(0); + ivec4 v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 1) : ivec4(0); + ivec4 v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 2) : ivec4(0); + ivec4 v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 3) : ivec4(0); + ivec4 v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 4) : ivec4(0); + ivec4 v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 5) : ivec4(0); + + ivec4 v50 = sy + 5 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 0) : ivec4(0); + ivec4 v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 1) : ivec4(0); + ivec4 v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 2) : ivec4(0); + ivec4 v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 3) : ivec4(0); + ivec4 v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 4) : ivec4(0); + ivec4 v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 5) : ivec4(0); + + int tile = gy * psc(block_x) + gx; + + ivec4 m00 = v04 + v00 * 4 - v02 * 5; + ivec4 m01 = v14 + v10 * 4 - v12 * 5; + ivec4 m02 = v24 + v20 * 4 - v22 * 5; + ivec4 m03 = v34 + v30 * 4 - v32 * 5; + ivec4 m04 = v44 + v40 * 4 - v42 * 5; + ivec4 m05 = v54 + v50 * 4 - v52 * 5; + + ivec4 m10 = (v04 - v02 * 4) + (v03 - v01 * 4); + ivec4 m11 = (v14 - v12 * 4) + (v13 - v11 * 4); + ivec4 m12 = (v24 - v22 * 4) + (v23 - v21 * 4); + ivec4 m13 = (v34 - v32 * 4) + (v33 - v31 * 4); + ivec4 m14 = (v44 - v42 * 4) + (v43 - v41 * 4); + ivec4 m15 = (v54 - v52 * 4) + (v53 - v51 * 4); + + ivec4 m20 = (v04 - v02 * 4) - (v03 - v01 * 4); + ivec4 m21 = (v14 - v12 * 4) - (v13 - v11 * 4); + ivec4 m22 = (v24 - v22 * 4) - (v23 - v21 * 4); + ivec4 m23 = (v34 - v32 * 4) - (v33 - v31 * 4); + ivec4 m24 = (v44 - v42 * 4) - (v43 - v41 * 4); + ivec4 m25 = (v54 - v52 * 4) - (v53 - v51 * 4); + + ivec4 m30 = (v04 - v02) + (v03 - v01) * 2; + ivec4 m31 = (v14 - v12) + (v13 - v11) * 2; + ivec4 m32 = (v24 - v22) + (v23 - v21) * 2; + ivec4 m33 = (v34 - v32) + (v33 - v31) * 2; + ivec4 m34 = (v44 - v42) + (v43 - v41) * 2; + ivec4 m35 = (v54 - v52) + (v53 - v51) * 2; + + ivec4 m40 = (v04 - v02) - (v03 - v01) * 2; + ivec4 m41 = (v14 - v12) - (v13 - v11) * 2; + ivec4 m42 = 
(v24 - v22) - (v23 - v21) * 2; + ivec4 m43 = (v34 - v32) - (v33 - v31) * 2; + ivec4 m44 = (v44 - v42) - (v43 - v41) * 2; + ivec4 m45 = (v54 - v52) - (v53 - v51) * 2; + + ivec4 m50 = v05 + v01 * 4 - v03 * 5; + ivec4 m51 = v15 + v11 * 4 - v13 * 5; + ivec4 m52 = v25 + v21 * 4 - v23 * 5; + ivec4 m53 = v35 + v31 * 4 - v33 * 5; + ivec4 m54 = v45 + v41 * 4 - v43 * 5; + ivec4 m55 = v55 + v51 * 4 - v53 * 5; + + v00 = m04 + m00 * 4 - m02 * 5; + v10 = m14 + m10 * 4 - m12 * 5; + v20 = m24 + m20 * 4 - m22 * 5; + v30 = m34 + m30 * 4 - m32 * 5; + v40 = m44 + m40 * 4 - m42 * 5; + v50 = m54 + m50 * 4 - m52 * 5; + + v01 = (m04 - m02 * 4) + (m03 - m01 * 4); + v11 = (m14 - m12 * 4) + (m13 - m11 * 4); + v21 = (m24 - m22 * 4) + (m23 - m21 * 4); + v31 = (m34 - m32 * 4) + (m33 - m31 * 4); + v41 = (m44 - m42 * 4) + (m43 - m41 * 4); + v51 = (m54 - m52 * 4) + (m53 - m51 * 4); + + v02 = (m04 - m02 * 4) - (m03 - m01 * 4); + v12 = (m14 - m12 * 4) - (m13 - m11 * 4); + v22 = (m24 - m22 * 4) - (m23 - m21 * 4); + v32 = (m34 - m32 * 4) - (m33 - m31 * 4); + v42 = (m44 - m42 * 4) - (m43 - m41 * 4); + v52 = (m54 - m52 * 4) - (m53 - m51 * 4); + + v03 = (m04 - m02) + (m03 - m01) * 2; + v13 = (m14 - m12) + (m13 - m11) * 2; + v23 = (m24 - m22) + (m23 - m21) * 2; + v33 = (m34 - m32) + (m33 - m31) * 2; + v43 = (m44 - m42) + (m43 - m41) * 2; + v53 = (m54 - m52) + (m53 - m51) * 2; + + v04 = (m04 - m02) - (m03 - m01) * 2; + v14 = (m14 - m12) - (m13 - m11) * 2; + v24 = (m24 - m22) - (m23 - m21) * 2; + v34 = (m34 - m32) - (m33 - m31) * 2; + v44 = (m44 - m42) - (m43 - m41) * 2; + v54 = (m54 - m52) - (m53 - m51) * 2; + + v05 = m05 + m01 * 4 - m03 * 5; + v15 = m15 + m11 * 4 - m13 * 5; + v25 = m25 + m21 * 4 - m23 * 5; + v35 = m35 + m31 * 4 - m33 * 5; + v45 = m45 + m41 * 4 - m43 * 5; + v55 = m55 + m51 * 4 - m53 * 5; + + // store 36 + int v_tm_offset = gz * psc(outcstep) + tile; + int v_tm_step = psc(outcstep) * psc(c); + + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00); + 
i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v04); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v05); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v10); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v11); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v12); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v13); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v14); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v15); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v20); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v21); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v22); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v23); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 16 * v_tm_step, v24); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 17 * v_tm_step, v25); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 18 * v_tm_step, v30); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 19 * v_tm_step, v31); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 20 * v_tm_step, v32); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 21 * v_tm_step, v33); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 22 * v_tm_step, v34); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 23 * v_tm_step, v35); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 24 * v_tm_step, v40); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 25 * v_tm_step, v41); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 26 * v_tm_step, v42); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 27 * v_tm_step, v43); + i16buffer_st4(bottom_tm_blob_data, 
v_tm_offset + 28 * v_tm_step, v44); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 29 * v_tm_step, v45); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 30 * v_tm_step, v50); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 31 * v_tm_step, v51); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 32 * v_tm_step, v52); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 33 * v_tm_step, v53); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 34 * v_tm_step, v54); + i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 35 * v_tm_step, v55); +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp new file mode 100644 index 000000000000..d2a1d70895f4 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp @@ -0,0 +1,329 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int c = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; + +layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; +layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outcstep; + + int block_x; + int block_y; + + int c; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = 
int(gl_GlobalInvocationID.z); + + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) + return; + + // load 6x6 + int sx = gx * 4; + int sy = gy * 4; + + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); + + ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); + ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0); + ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0); + ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0); + ivec4 v04 = sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 4) : ivec4(0); + ivec4 v05 = sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 5) : ivec4(0); + + ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0); + ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0); + ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0); + ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0); + ivec4 v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 4) : ivec4(0); + ivec4 v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 5) : ivec4(0); + + ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0); + ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0); + ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0); + ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0); + ivec4 v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 4) : ivec4(0); + ivec4 v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 5) : ivec4(0); + + ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0); + ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0); + ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0); + ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0); + ivec4 v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 4) : ivec4(0); + ivec4 v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 5) : ivec4(0); + + ivec4 v40 = sy + 4 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 0) : ivec4(0); + ivec4 v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 1) : ivec4(0); + ivec4 v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 2) : ivec4(0); + ivec4 v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 3) : ivec4(0); + ivec4 v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 4) : ivec4(0); + ivec4 v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 5) : ivec4(0); + + ivec4 v50 = sy + 5 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 0) : ivec4(0); + ivec4 v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 1) : ivec4(0); + ivec4 v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 2) : ivec4(0); + ivec4 v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 3) : ivec4(0); + ivec4 v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 4) : ivec4(0); + ivec4 v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 5) : ivec4(0); + + int tile = gy * psc(block_x) + gx; + + ivec4 m00 = v04 + v00 * 4 - v02 * 5; + ivec4 m01 = v14 + v10 * 4 - v12 * 5; + ivec4 m02 = v24 + v20 * 4 - v22 * 5; + ivec4 m03 = v34 + v30 * 4 - v32 * 5; + ivec4 m04 = v44 + v40 * 4 - v42 * 5; + ivec4 m05 = v54 + v50 * 4 - v52 * 5; + + ivec4 m10 = (v04 - v02 * 4) + (v03 - v01 * 4); + ivec4 m11 = (v14 - v12 * 4) + (v13 - v11 * 4); + ivec4 m12 = (v24 - v22 * 4) + (v23 - v21 * 4); + ivec4 m13 = (v34 - v32 * 4) + (v33 - v31 * 4); + ivec4 m14 = (v44 - v42 * 4) + (v43 - v41 * 4); + ivec4 m15 = (v54 - v52 * 4) + (v53 - v51 * 4); + + ivec4 m20 = (v04 - v02 * 4) - (v03 - v01 * 4); + ivec4 m21 = (v14 - v12 * 4) - (v13 - v11 * 4); + ivec4 m22 = (v24 - v22 * 4) - (v23 - v21 * 4); + ivec4 m23 = (v34 - v32 * 4) - (v33 - v31 * 4); + ivec4 m24 = (v44 - v42 * 4) - (v43 - v41 * 4); + ivec4 m25 = (v54 - v52 * 4) - (v53 - v51 * 4); + + ivec4 m30 = (v04 - v02) + (v03 - v01) * 2; + ivec4 m31 = (v14 - v12) + (v13 - v11) * 2; + ivec4 m32 = (v24 - v22) + (v23 - v21) * 2; + ivec4 m33 = (v34 - v32) + (v33 - v31) * 2; + ivec4 m34 = (v44 - v42) + (v43 - v41) * 2; + ivec4 m35 = (v54 - v52) + (v53 - v51) * 2; + + ivec4 m40 = (v04 - v02) - (v03 - v01) * 2; + ivec4 m41 = (v14 - v12) - (v13 - v11) * 2; + ivec4 m42 = (v24 - v22) - (v23 - v21) * 2; + ivec4 m43 = (v34 - v32) - (v33 - v31) * 2; + ivec4 m44 = (v44 - v42) - (v43 - v41) * 2; + ivec4 m45 = (v54 - v52) - (v53 - v51) * 2; + + ivec4 m50 = v05 + v01 * 4 - v03 * 5; + ivec4 m51 = v15 + v11 * 4 - v13 * 5; + ivec4 m52 = v25 + v21 * 4 - v23 * 5; + ivec4 m53 = v35 + v31 * 4 - v33 * 5; + ivec4 m54 = 
v45 + v41 * 4 - v43 * 5; + ivec4 m55 = v55 + v51 * 4 - v53 * 5; + + v00 = m04 + m00 * 4 - m02 * 5; + v10 = m14 + m10 * 4 - m12 * 5; + v20 = m24 + m20 * 4 - m22 * 5; + v30 = m34 + m30 * 4 - m32 * 5; + v40 = m44 + m40 * 4 - m42 * 5; + v50 = m54 + m50 * 4 - m52 * 5; + + v01 = (m04 - m02 * 4) + (m03 - m01 * 4); + v11 = (m14 - m12 * 4) + (m13 - m11 * 4); + v21 = (m24 - m22 * 4) + (m23 - m21 * 4); + v31 = (m34 - m32 * 4) + (m33 - m31 * 4); + v41 = (m44 - m42 * 4) + (m43 - m41 * 4); + v51 = (m54 - m52 * 4) + (m53 - m51 * 4); + + v02 = (m04 - m02 * 4) - (m03 - m01 * 4); + v12 = (m14 - m12 * 4) - (m13 - m11 * 4); + v22 = (m24 - m22 * 4) - (m23 - m21 * 4); + v32 = (m34 - m32 * 4) - (m33 - m31 * 4); + v42 = (m44 - m42 * 4) - (m43 - m41 * 4); + v52 = (m54 - m52 * 4) - (m53 - m51 * 4); + + v03 = (m04 - m02) + (m03 - m01) * 2; + v13 = (m14 - m12) + (m13 - m11) * 2; + v23 = (m24 - m22) + (m23 - m21) * 2; + v33 = (m34 - m32) + (m33 - m31) * 2; + v43 = (m44 - m42) + (m43 - m41) * 2; + v53 = (m54 - m52) + (m53 - m51) * 2; + + v04 = (m04 - m02) - (m03 - m01) * 2; + v14 = (m14 - m12) - (m13 - m11) * 2; + v24 = (m24 - m22) - (m23 - m21) * 2; + v34 = (m34 - m32) - (m33 - m31) * 2; + v44 = (m44 - m42) - (m43 - m41) * 2; + v54 = (m54 - m52) - (m53 - m51) * 2; + + v05 = m05 + m01 * 4 - m03 * 5; + v15 = m15 + m11 * 4 - m13 * 5; + v25 = m25 + m21 * 4 - m23 * 5; + v35 = m35 + m31 * 4 - m33 * 5; + v45 = m45 + m41 * 4 - m43 * 5; + v55 = m55 + m51 * 4 - m53 * 5; + + // store 36 + const int outcstepd2 = psc(outcstep) / 2; + int v_tm_offset = gz * outcstepd2 + tile; + int v_tm_step = outcstepd2 * psc(c); + + ivec4 v00_low = v00 & ivec4(255); + v00_low = mix(v00_low, v00_low - ivec4(256), greaterThanEqual(v00_low, ivec4(128))); + ivec4 v00_high = (v00 - v00_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 0 * v_tm_step] = ivec2(packInt4x8(v00_low), packInt4x8(v00_high)); + ivec4 v01_low = v01 & ivec4(255); + v01_low = mix(v01_low, v01_low - ivec4(256), greaterThanEqual(v01_low, ivec4(128))); + 
ivec4 v01_high = (v01 - v01_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 1 * v_tm_step] = ivec2(packInt4x8(v01_low), packInt4x8(v01_high)); + ivec4 v02_low = v02 & ivec4(255); + v02_low = mix(v02_low, v02_low - ivec4(256), greaterThanEqual(v02_low, ivec4(128))); + ivec4 v02_high = (v02 - v02_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 2 * v_tm_step] = ivec2(packInt4x8(v02_low), packInt4x8(v02_high)); + ivec4 v03_low = v03 & ivec4(255); + v03_low = mix(v03_low, v03_low - ivec4(256), greaterThanEqual(v03_low, ivec4(128))); + ivec4 v03_high = (v03 - v03_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 3 * v_tm_step] = ivec2(packInt4x8(v03_low), packInt4x8(v03_high)); + ivec4 v04_low = v04 & ivec4(255); + v04_low = mix(v04_low, v04_low - ivec4(256), greaterThanEqual(v04_low, ivec4(128))); + ivec4 v04_high = (v04 - v04_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 4 * v_tm_step] = ivec2(packInt4x8(v04_low), packInt4x8(v04_high)); + ivec4 v05_low = v05 & ivec4(255); + v05_low = mix(v05_low, v05_low - ivec4(256), greaterThanEqual(v05_low, ivec4(128))); + ivec4 v05_high = (v05 - v05_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 5 * v_tm_step] = ivec2(packInt4x8(v05_low), packInt4x8(v05_high)); + ivec4 v10_low = v10 & ivec4(255); + v10_low = mix(v10_low, v10_low - ivec4(256), greaterThanEqual(v10_low, ivec4(128))); + ivec4 v10_high = (v10 - v10_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 6 * v_tm_step] = ivec2(packInt4x8(v10_low), packInt4x8(v10_high)); + ivec4 v11_low = v11 & ivec4(255); + v11_low = mix(v11_low, v11_low - ivec4(256), greaterThanEqual(v11_low, ivec4(128))); + ivec4 v11_high = (v11 - v11_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 7 * v_tm_step] = ivec2(packInt4x8(v11_low), packInt4x8(v11_high)); + ivec4 v12_low = v12 & ivec4(255); + v12_low = mix(v12_low, v12_low - ivec4(256), greaterThanEqual(v12_low, ivec4(128))); + ivec4 v12_high = (v12 - v12_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 8 * v_tm_step] = 
ivec2(packInt4x8(v12_low), packInt4x8(v12_high)); + ivec4 v13_low = v13 & ivec4(255); + v13_low = mix(v13_low, v13_low - ivec4(256), greaterThanEqual(v13_low, ivec4(128))); + ivec4 v13_high = (v13 - v13_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 9 * v_tm_step] = ivec2(packInt4x8(v13_low), packInt4x8(v13_high)); + ivec4 v14_low = v14 & ivec4(255); + v14_low = mix(v14_low, v14_low - ivec4(256), greaterThanEqual(v14_low, ivec4(128))); + ivec4 v14_high = (v14 - v14_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 10 * v_tm_step] = ivec2(packInt4x8(v14_low), packInt4x8(v14_high)); + ivec4 v15_low = v15 & ivec4(255); + v15_low = mix(v15_low, v15_low - ivec4(256), greaterThanEqual(v15_low, ivec4(128))); + ivec4 v15_high = (v15 - v15_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 11 * v_tm_step] = ivec2(packInt4x8(v15_low), packInt4x8(v15_high)); + ivec4 v20_low = v20 & ivec4(255); + v20_low = mix(v20_low, v20_low - ivec4(256), greaterThanEqual(v20_low, ivec4(128))); + ivec4 v20_high = (v20 - v20_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 12 * v_tm_step] = ivec2(packInt4x8(v20_low), packInt4x8(v20_high)); + ivec4 v21_low = v21 & ivec4(255); + v21_low = mix(v21_low, v21_low - ivec4(256), greaterThanEqual(v21_low, ivec4(128))); + ivec4 v21_high = (v21 - v21_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 13 * v_tm_step] = ivec2(packInt4x8(v21_low), packInt4x8(v21_high)); + ivec4 v22_low = v22 & ivec4(255); + v22_low = mix(v22_low, v22_low - ivec4(256), greaterThanEqual(v22_low, ivec4(128))); + ivec4 v22_high = (v22 - v22_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 14 * v_tm_step] = ivec2(packInt4x8(v22_low), packInt4x8(v22_high)); + ivec4 v23_low = v23 & ivec4(255); + v23_low = mix(v23_low, v23_low - ivec4(256), greaterThanEqual(v23_low, ivec4(128))); + ivec4 v23_high = (v23 - v23_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 15 * v_tm_step] = ivec2(packInt4x8(v23_low), packInt4x8(v23_high)); + ivec4 v24_low = v24 & ivec4(255); + v24_low 
= mix(v24_low, v24_low - ivec4(256), greaterThanEqual(v24_low, ivec4(128))); + ivec4 v24_high = (v24 - v24_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 16 * v_tm_step] = ivec2(packInt4x8(v24_low), packInt4x8(v24_high)); + ivec4 v25_low = v25 & ivec4(255); + v25_low = mix(v25_low, v25_low - ivec4(256), greaterThanEqual(v25_low, ivec4(128))); + ivec4 v25_high = (v25 - v25_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 17 * v_tm_step] = ivec2(packInt4x8(v25_low), packInt4x8(v25_high)); + ivec4 v30_low = v30 & ivec4(255); + v30_low = mix(v30_low, v30_low - ivec4(256), greaterThanEqual(v30_low, ivec4(128))); + ivec4 v30_high = (v30 - v30_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 18 * v_tm_step] = ivec2(packInt4x8(v30_low), packInt4x8(v30_high)); + ivec4 v31_low = v31 & ivec4(255); + v31_low = mix(v31_low, v31_low - ivec4(256), greaterThanEqual(v31_low, ivec4(128))); + ivec4 v31_high = (v31 - v31_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 19 * v_tm_step] = ivec2(packInt4x8(v31_low), packInt4x8(v31_high)); + ivec4 v32_low = v32 & ivec4(255); + v32_low = mix(v32_low, v32_low - ivec4(256), greaterThanEqual(v32_low, ivec4(128))); + ivec4 v32_high = (v32 - v32_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 20 * v_tm_step] = ivec2(packInt4x8(v32_low), packInt4x8(v32_high)); + ivec4 v33_low = v33 & ivec4(255); + v33_low = mix(v33_low, v33_low - ivec4(256), greaterThanEqual(v33_low, ivec4(128))); + ivec4 v33_high = (v33 - v33_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 21 * v_tm_step] = ivec2(packInt4x8(v33_low), packInt4x8(v33_high)); + ivec4 v34_low = v34 & ivec4(255); + v34_low = mix(v34_low, v34_low - ivec4(256), greaterThanEqual(v34_low, ivec4(128))); + ivec4 v34_high = (v34 - v34_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 22 * v_tm_step] = ivec2(packInt4x8(v34_low), packInt4x8(v34_high)); + ivec4 v35_low = v35 & ivec4(255); + v35_low = mix(v35_low, v35_low - ivec4(256), greaterThanEqual(v35_low, ivec4(128))); + ivec4 v35_high = 
(v35 - v35_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 23 * v_tm_step] = ivec2(packInt4x8(v35_low), packInt4x8(v35_high)); + ivec4 v40_low = v40 & ivec4(255); + v40_low = mix(v40_low, v40_low - ivec4(256), greaterThanEqual(v40_low, ivec4(128))); + ivec4 v40_high = (v40 - v40_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 24 * v_tm_step] = ivec2(packInt4x8(v40_low), packInt4x8(v40_high)); + ivec4 v41_low = v41 & ivec4(255); + v41_low = mix(v41_low, v41_low - ivec4(256), greaterThanEqual(v41_low, ivec4(128))); + ivec4 v41_high = (v41 - v41_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 25 * v_tm_step] = ivec2(packInt4x8(v41_low), packInt4x8(v41_high)); + ivec4 v42_low = v42 & ivec4(255); + v42_low = mix(v42_low, v42_low - ivec4(256), greaterThanEqual(v42_low, ivec4(128))); + ivec4 v42_high = (v42 - v42_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 26 * v_tm_step] = ivec2(packInt4x8(v42_low), packInt4x8(v42_high)); + ivec4 v43_low = v43 & ivec4(255); + v43_low = mix(v43_low, v43_low - ivec4(256), greaterThanEqual(v43_low, ivec4(128))); + ivec4 v43_high = (v43 - v43_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 27 * v_tm_step] = ivec2(packInt4x8(v43_low), packInt4x8(v43_high)); + ivec4 v44_low = v44 & ivec4(255); + v44_low = mix(v44_low, v44_low - ivec4(256), greaterThanEqual(v44_low, ivec4(128))); + ivec4 v44_high = (v44 - v44_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 28 * v_tm_step] = ivec2(packInt4x8(v44_low), packInt4x8(v44_high)); + ivec4 v45_low = v45 & ivec4(255); + v45_low = mix(v45_low, v45_low - ivec4(256), greaterThanEqual(v45_low, ivec4(128))); + ivec4 v45_high = (v45 - v45_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 29 * v_tm_step] = ivec2(packInt4x8(v45_low), packInt4x8(v45_high)); + ivec4 v50_low = v50 & ivec4(255); + v50_low = mix(v50_low, v50_low - ivec4(256), greaterThanEqual(v50_low, ivec4(128))); + ivec4 v50_high = (v50 - v50_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 30 * v_tm_step] = 
ivec2(packInt4x8(v50_low), packInt4x8(v50_high)); + ivec4 v51_low = v51 & ivec4(255); + v51_low = mix(v51_low, v51_low - ivec4(256), greaterThanEqual(v51_low, ivec4(128))); + ivec4 v51_high = (v51 - v51_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 31 * v_tm_step] = ivec2(packInt4x8(v51_low), packInt4x8(v51_high)); + ivec4 v52_low = v52 & ivec4(255); + v52_low = mix(v52_low, v52_low - ivec4(256), greaterThanEqual(v52_low, ivec4(128))); + ivec4 v52_high = (v52 - v52_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 32 * v_tm_step] = ivec2(packInt4x8(v52_low), packInt4x8(v52_high)); + ivec4 v53_low = v53 & ivec4(255); + v53_low = mix(v53_low, v53_low - ivec4(256), greaterThanEqual(v53_low, ivec4(128))); + ivec4 v53_high = (v53 - v53_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 33 * v_tm_step] = ivec2(packInt4x8(v53_low), packInt4x8(v53_high)); + ivec4 v54_low = v54 & ivec4(255); + v54_low = mix(v54_low, v54_low - ivec4(256), greaterThanEqual(v54_low, ivec4(128))); + ivec4 v54_high = (v54 - v54_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 34 * v_tm_step] = ivec2(packInt4x8(v54_low), packInt4x8(v54_high)); + ivec4 v55_low = v55 & ivec4(255); + v55_low = mix(v55_low, v55_low - ivec4(256), greaterThanEqual(v55_low, ivec4(128))); + ivec4 v55_high = (v55 - v55_low) >> 8; + bottom_tm_low_high_data[v_tm_offset + 35 * v_tm_step] = ivec2(packInt4x8(v55_low), packInt4x8(v55_high)); +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp new file mode 100644 index 000000000000..056ca559e3a9 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp @@ -0,0 +1,299 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int 
bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int use_int8_requantize = 0; +layout(binding = 0) readonly buffer top_tm_blob { ivec4 top_tm_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 3) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 5) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; + int outc; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + const int outc4 = (p.outc + 3) / 4; + + if (gx >= p.block_x || gy >= p.block_y || gz >= outc4) + return; + + // load 36 + int v_tm_offset = gz * p.cstep + gy * p.block_x + gx; + int v_tm_step = p.cstep * outc4; + + ivec4 v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step]; + ivec4 v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step]; + ivec4 v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step]; + ivec4 v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step]; + ivec4 v04 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step]; + ivec4 v05 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step]; + ivec4 v10 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step]; + ivec4 v11 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step]; + ivec4 v12 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step]; + ivec4 v13 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step]; + ivec4 v14 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step]; + ivec4 v15 = top_tm_blob_data[v_tm_offset + 11 * 
v_tm_step]; + ivec4 v20 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step]; + ivec4 v21 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step]; + ivec4 v22 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step]; + ivec4 v23 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step]; + ivec4 v24 = top_tm_blob_data[v_tm_offset + 16 * v_tm_step]; + ivec4 v25 = top_tm_blob_data[v_tm_offset + 17 * v_tm_step]; + ivec4 v30 = top_tm_blob_data[v_tm_offset + 18 * v_tm_step]; + ivec4 v31 = top_tm_blob_data[v_tm_offset + 19 * v_tm_step]; + ivec4 v32 = top_tm_blob_data[v_tm_offset + 20 * v_tm_step]; + ivec4 v33 = top_tm_blob_data[v_tm_offset + 21 * v_tm_step]; + ivec4 v34 = top_tm_blob_data[v_tm_offset + 22 * v_tm_step]; + ivec4 v35 = top_tm_blob_data[v_tm_offset + 23 * v_tm_step]; + ivec4 v40 = top_tm_blob_data[v_tm_offset + 24 * v_tm_step]; + ivec4 v41 = top_tm_blob_data[v_tm_offset + 25 * v_tm_step]; + ivec4 v42 = top_tm_blob_data[v_tm_offset + 26 * v_tm_step]; + ivec4 v43 = top_tm_blob_data[v_tm_offset + 27 * v_tm_step]; + ivec4 v44 = top_tm_blob_data[v_tm_offset + 28 * v_tm_step]; + ivec4 v45 = top_tm_blob_data[v_tm_offset + 29 * v_tm_step]; + ivec4 v50 = top_tm_blob_data[v_tm_offset + 30 * v_tm_step]; + ivec4 v51 = top_tm_blob_data[v_tm_offset + 31 * v_tm_step]; + ivec4 v52 = top_tm_blob_data[v_tm_offset + 32 * v_tm_step]; + ivec4 v53 = top_tm_blob_data[v_tm_offset + 33 * v_tm_step]; + ivec4 v54 = top_tm_blob_data[v_tm_offset + 34 * v_tm_step]; + ivec4 v55 = top_tm_blob_data[v_tm_offset + 35 * v_tm_step]; + + // implicit transpose + ivec4 m00 = v00 + v01 + v02 + v03 + v04; + ivec4 m01 = v10 + v11 + v12 + v13 + v14; + ivec4 m02 = v20 + v21 + v22 + v23 + v24; + ivec4 m03 = v30 + v31 + v32 + v33 + v34; + ivec4 m04 = v40 + v41 + v42 + v43 + v44; + ivec4 m05 = (v50 + v51 + v52 + v53 + v54) * 4; + + ivec4 m10 = (v01 - v02) + (v03 - v04) * 2; + ivec4 m11 = (v11 - v12) + (v13 - v14) * 2; + ivec4 m12 = (v21 - v22) + (v23 - v24) * 2; + ivec4 m13 = (v31 - v32) + (v33 - v34) * 2; + ivec4 m14 = (v41 - v42) + 
(v43 - v44) * 2; + ivec4 m15 = ((v51 - v52) + (v53 - v54) * 2) * 4; + + ivec4 m20 = (v01 + v02) + (v03 + v04) * 4; + ivec4 m21 = (v11 + v12) + (v13 + v14) * 4; + ivec4 m22 = (v21 + v22) + (v23 + v24) * 4; + ivec4 m23 = (v31 + v32) + (v33 + v34) * 4; + ivec4 m24 = (v41 + v42) + (v43 + v44) * 4; + ivec4 m25 = ((v51 + v52) + (v53 + v54) * 4) * 4; + + ivec4 m30 = (v01 - v02) + (v03 - v04) * 8 + v05 * 4; + ivec4 m31 = (v11 - v12) + (v13 - v14) * 8 + v15 * 4; + ivec4 m32 = (v21 - v22) + (v23 - v24) * 8 + v25 * 4; + ivec4 m33 = (v31 - v32) + (v33 - v34) * 8 + v35 * 4; + ivec4 m34 = (v41 - v42) + (v43 - v44) * 8 + v45 * 4; + ivec4 m35 = ((v51 - v52) + (v53 - v54) * 8 + v55 * 4) * 4; + + v00 = m00 + m01 + m02 + m03 + m04; + v10 = m10 + m11 + m12 + m13 + m14; + v20 = m20 + m21 + m22 + m23 + m24; + v30 = m30 + m31 + m32 + m33 + m34; + + v01 = (m01 - m02) + (m03 - m04) * 2; + v11 = (m11 - m12) + (m13 - m14) * 2; + v21 = (m21 - m22) + (m23 - m24) * 2; + v31 = (m31 - m32) + (m33 - m34) * 2; + + v02 = (m01 + m02) + (m03 + m04) * 4; + v12 = (m11 + m12) + (m13 + m14) * 4; + v22 = (m21 + m22) + (m23 + m24) * 4; + v32 = (m31 + m32) + (m33 + m34) * 4; + + v03 = (m01 - m02) + (m03 - m04) * 8 + m05; + v13 = (m11 - m12) + (m13 - m14) * 8 + m15; + v23 = (m21 - m22) + (m23 - m24) * 8 + m25; + v33 = (m31 - m32) + (m33 - m34) * 8 + m35; + + const vec4 descale = vec4(1.f / 576.f) * weight_descales_data[gz]; + + vec4 out00 = vec4(v00) * descale; + vec4 out01 = vec4(v01) * descale; + vec4 out02 = vec4(v02) * descale; + vec4 out03 = vec4(v03) * descale; + vec4 out10 = vec4(v10) * descale; + vec4 out11 = vec4(v11) * descale; + vec4 out12 = vec4(v12) * descale; + vec4 out13 = vec4(v13) * descale; + vec4 out20 = vec4(v20) * descale; + vec4 out21 = vec4(v21) * descale; + vec4 out22 = vec4(v22) * descale; + vec4 out23 = vec4(v23) * descale; + vec4 out30 = vec4(v30) * descale; + vec4 out31 = vec4(v31) * descale; + vec4 out32 = vec4(v32) * descale; + vec4 out33 = vec4(v33) * descale; + + if (bias_term 
== 1) + { + const vec4 bias_value = bias_data[gz]; + + out00 += bias_value; + out01 += bias_value; + out02 += bias_value; + out03 += bias_value; + out10 += bias_value; + out11 += bias_value; + out12 += bias_value; + out13 += bias_value; + out20 += bias_value; + out21 += bias_value; + out22 += bias_value; + out23 += bias_value; + out30 += bias_value; + out31 += bias_value; + out32 += bias_value; + out33 += bias_value; + } + + out00 = vec4(activation_afpvec4(afpvec4(out00), activation_type, activation_param_0, activation_param_1)); + out01 = vec4(activation_afpvec4(afpvec4(out01), activation_type, activation_param_0, activation_param_1)); + out02 = vec4(activation_afpvec4(afpvec4(out02), activation_type, activation_param_0, activation_param_1)); + out03 = vec4(activation_afpvec4(afpvec4(out03), activation_type, activation_param_0, activation_param_1)); + out10 = vec4(activation_afpvec4(afpvec4(out10), activation_type, activation_param_0, activation_param_1)); + out11 = vec4(activation_afpvec4(afpvec4(out11), activation_type, activation_param_0, activation_param_1)); + out12 = vec4(activation_afpvec4(afpvec4(out12), activation_type, activation_param_0, activation_param_1)); + out13 = vec4(activation_afpvec4(afpvec4(out13), activation_type, activation_param_0, activation_param_1)); + out20 = vec4(activation_afpvec4(afpvec4(out20), activation_type, activation_param_0, activation_param_1)); + out21 = vec4(activation_afpvec4(afpvec4(out21), activation_type, activation_param_0, activation_param_1)); + out22 = vec4(activation_afpvec4(afpvec4(out22), activation_type, activation_param_0, activation_param_1)); + out23 = vec4(activation_afpvec4(afpvec4(out23), activation_type, activation_param_0, activation_param_1)); + out30 = vec4(activation_afpvec4(afpvec4(out30), activation_type, activation_param_0, activation_param_1)); + out31 = vec4(activation_afpvec4(afpvec4(out31), activation_type, activation_param_0, activation_param_1)); + out32 = 
vec4(activation_afpvec4(afpvec4(out32), activation_type, activation_param_0, activation_param_1)); + out33 = vec4(activation_afpvec4(afpvec4(out33), activation_type, activation_param_0, activation_param_1)); + + // store 4x4 + int x = gx * 4; + int y = gy * 4; + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + + out00 *= top_scale; + out01 *= top_scale; + out02 *= top_scale; + out03 *= top_scale; + out10 *= top_scale; + out11 *= top_scale; + out12 *= top_scale; + out13 *= top_scale; + out20 *= top_scale; + out21 *= top_scale; + out22 *= top_scale; + out23 *= top_scale; + out30 *= top_scale; + out31 *= top_scale; + out32 *= top_scale; + out33 *= top_scale; + + ivec4 out00_int8 = float2int8vec4(out00); + ivec4 out01_int8 = float2int8vec4(out01); + ivec4 out02_int8 = float2int8vec4(out02); + ivec4 out03_int8 = float2int8vec4(out03); + ivec4 out10_int8 = float2int8vec4(out10); + ivec4 out11_int8 = float2int8vec4(out11); + ivec4 out12_int8 = float2int8vec4(out12); + ivec4 out13_int8 = float2int8vec4(out13); + ivec4 out20_int8 = float2int8vec4(out20); + ivec4 out21_int8 = float2int8vec4(out21); + ivec4 out22_int8 = float2int8vec4(out22); + ivec4 out23_int8 = float2int8vec4(out23); + ivec4 out30_int8 = float2int8vec4(out30); + ivec4 out31_int8 = float2int8vec4(out31); + ivec4 out32_int8 = float2int8vec4(out32); + ivec4 out33_int8 = float2int8vec4(out33); + + ivec4 v_offset = gz * p.outcstep + y * p.outw + x + ivec4(0, 1, 2, 3) * p.outw; + + i8buffer_st4(top_blob_int8_data, v_offset.r + 0, out00_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 1, out01_int8); + if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 2, out02_int8); + if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 3, out03_int8); + + if (y + 1 < p.outh) + { + i8buffer_st4(top_blob_int8_data, v_offset.g + 0, out10_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 1, out11_int8); + if 
(x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 2, out12_int8); + if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 3, out13_int8); + } + + if (y + 2 < p.outh) + { + i8buffer_st4(top_blob_int8_data, v_offset.b + 0, out20_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 1, out21_int8); + if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 2, out22_int8); + if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 3, out23_int8); + } + + if (y + 3 < p.outh) + { + i8buffer_st4(top_blob_int8_data, v_offset.a + 0, out30_int8); + if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 1, out31_int8); + if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 2, out32_int8); + if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 3, out33_int8); + } + } + else + { + ivec4 v_offset = gz * p.outcstep + y * p.outw + x + ivec4(0, 1, 2, 3) * p.outw; + + buffer_st4(top_blob_data, v_offset.r + 0, afpvec4(out00)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.r + 1, afpvec4(out01)); + if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.r + 2, afpvec4(out02)); + if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.r + 3, afpvec4(out03)); + + if (y + 1 < p.outh) + { + buffer_st4(top_blob_data, v_offset.g + 0, afpvec4(out10)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.g + 1, afpvec4(out11)); + if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.g + 2, afpvec4(out12)); + if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.g + 3, afpvec4(out13)); + } + + if (y + 2 < p.outh) + { + buffer_st4(top_blob_data, v_offset.b + 0, afpvec4(out20)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.b + 1, afpvec4(out21)); + if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.b + 2, afpvec4(out22)); + if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.b + 3, afpvec4(out23)); + } + + if (y + 3 < p.outh) + { + buffer_st4(top_blob_data, 
v_offset.a + 0, afpvec4(out30)); + if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.a + 1, afpvec4(out31)); + if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.a + 2, afpvec4(out32)); + if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.a + 3, afpvec4(out33)); + } + } +} diff --git a/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp b/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp new file mode 100644 index 000000000000..e3e49ed2dcb2 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp @@ -0,0 +1,496 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +#define LOCAL_MEMORY_UNROLL_INCH 8 + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int use_int8_requantize = 0; +layout(constant_id = 5) const int elempack = 1; +layout(constant_id = 6) const int out_elempack = 1; + +#define shape_constant_id_offset 7 +layout(constant_id = shape_constant_id_offset + 0) const int c = 0; +layout(constant_id = shape_constant_id_offset + 1) const int cstep = 0; +layout(constant_id = shape_constant_id_offset + 2) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; +layout(constant_id = shape_constant_id_offset + 4) const int outcstep_native = 0; +layout(constant_id = shape_constant_id_offset + 5) const int size = 0; +layout(constant_id = shape_constant_id_offset + 6) const int num_output = 0; +layout(constant_id = shape_constant_id_offset + 7) const int 
num_input = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; +layout(binding = 3) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 4) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 5) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 6) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int c; + int cstep; + int outc; + int outcstep; + int outcstep_native; + int size; +} p; + +#if NCNN_shader_local_memory +shared ivec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH]; +shared ivec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int gy = int(gl_GlobalInvocationID.y); + +#if !NCNN_shader_local_memory + if (gx >= psc(size) || gy >= psc(outc)) + return; +#endif + + const int base_pos = gx * 4; + + ivec4 sum0 = ivec4(0); + ivec4 sum1 = ivec4(0); + ivec4 sum2 = ivec4(0); + ivec4 sum3 = ivec4(0); + + int w_offset = gy * psc(c) * 4; + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < LOCAL_MEMORY_UNROLL_INCH) + { + const int zz = z + ly; + + if (elempack == 4) + { + int v0p = 0; + int v1p = 0; + int v2p = 0; + int v3p = 0; + + if (gx < psc(size)) + { + const int v_offset = base_pos + zz * psc(cstep); + v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0); + v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1); + v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2); + v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3); + } + + tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p); + } + 
else // elempack == 1 + { + const int ci = zz * 4; + + int r0p = 0; + int r1p = 0; + int r2p = 0; + int r3p = 0; + + if (gx < psc(size)) + { + r0p = i8buffer_sm4(bottom_blob_int8_data, gx + ci * psc(cstep)); + if (ci + 1 < num_input) r1p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 1) * psc(cstep)); + if (ci + 2 < num_input) r2p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 2) * psc(cstep)); + if (ci + 3 < num_input) r3p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 3) * psc(cstep)); + } + + const ivec4 r0 = unpackInt4x8(r0p); + const ivec4 r1 = unpackInt4x8(r1p); + const ivec4 r2 = unpackInt4x8(r2p); + const ivec4 r3 = unpackInt4x8(r3p); + + tmp_v[lx][ly] = ivec4(packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r)), + packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g)), + packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b)), + packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a))); + } + } + + if (lx < LOCAL_MEMORY_UNROLL_INCH) + { + tmp_k[ly][lx] = weight_data[w_offset / 4 + lx]; + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + const ivec4 vp = tmp_v[lx][z4]; + const int v0p = vp.r; + const int v1p = vp.g; + const int v2p = vp.b; + const int v3p = vp.a; + + const ivec4 kp = tmp_k[ly][z4]; + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 
v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(k0p); + const ivec4 k1 = unpackInt4x8(k1p); + const ivec4 k2 = unpackInt4x8(k2p); + const ivec4 k3 = unpackInt4x8(k3p); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, + v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + } + + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < remain) + { + const int zz = z + ly; + + if (elempack == 4) + { + int v0p = 0; + int v1p = 0; + int v2p = 0; + int v3p = 0; + + if (gx < psc(size)) + { + const int v_offset = base_pos + zz * psc(cstep); + v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0); + v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1); + v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2); + v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3); + } + + tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p); + } + else // elempack == 1 + { + const int ci = zz * 4; + + int r0p = 0; + int r1p = 0; + int r2p = 0; + int r3p = 0; + + if (gx < 
psc(size)) + { + r0p = i8buffer_sm4(bottom_blob_int8_data, gx + ci * psc(cstep)); + if (ci + 1 < num_input) r1p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 1) * psc(cstep)); + if (ci + 2 < num_input) r2p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 2) * psc(cstep)); + if (ci + 3 < num_input) r3p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 3) * psc(cstep)); + } + + const ivec4 r0 = unpackInt4x8(r0p); + const ivec4 r1 = unpackInt4x8(r1p); + const ivec4 r2 = unpackInt4x8(r2p); + const ivec4 r3 = unpackInt4x8(r3p); + + tmp_v[lx][ly] = ivec4(packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r)), + packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g)), + packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b)), + packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a))); + } + } + + if (lx < remain) + { + tmp_k[ly][lx] = weight_data[w_offset / 4 + lx]; + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + const ivec4 vp = tmp_v[lx][z4]; + const int v0p = vp.r; + const int v1p = vp.g; + const int v2p = vp.b; + const int v3p = vp.a; + + const ivec4 kp = tmp_k[ly][z4]; + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(k0p); + const ivec4 k1 = 
unpackInt4x8(k1p); + const ivec4 k2 = unpackInt4x8(k2p); + const ivec4 k3 = unpackInt4x8(k3p); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, + v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + } + } +#else + for (int z = 0; z < psc(c); z++) + { + const ivec4 kp = weight_data[w_offset / 4]; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + int v0p; + int v1p; + int v2p; + int v3p; + + if (elempack == 4) + { + const int v_offset = base_pos + z * psc(cstep); + v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0); + v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1); + v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2); + v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3); + } + else // elempack == 1 + { + const int v_offset = gx + z * 4 * psc(cstep); + const int ci = z * 4; + const ivec4 r0 = i8buffer_ld4(bottom_blob_int8_data, v_offset); + const ivec4 r1 = ci + 1 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep)) : ivec4(0); + const ivec4 r2 = ci + 2 < num_input ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 2) : ivec4(0); + const ivec4 r3 = ci + 3 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 3) : ivec4(0); + + v0p = packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r)); + v1p = packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g)); + v2p = packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b)); + v3p = packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a)); + } + + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + ivec4 v0; + ivec4 v1; + ivec4 v2; + ivec4 v3; + + if (elempack == 4) + { + const int v_offset = base_pos + z * psc(cstep); + v0 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 0); + v1 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 1); + v2 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 2); + v3 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 3); + } + else // elempack == 1 + { + const int v_offset = gx + z * 4 * psc(cstep); + const int ci = z * 4; + const ivec4 r0 = i8buffer_ld4(bottom_blob_int8_data, v_offset); + const ivec4 r1 = ci + 1 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep)) : ivec4(0); + const ivec4 r2 = ci + 2 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 2) : ivec4(0); + const ivec4 r3 = ci + 3 < num_input ? 
i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 3) : ivec4(0); + + v0 = ivec4(r0.r, r1.r, r2.r, r3.r); + v1 = ivec4(r0.g, r1.g, r2.g, r3.g); + v2 = ivec4(r0.b, r1.b, r2.b, r3.b); + v3 = ivec4(r0.a, r1.a, r2.a, r3.a); + } + + const ivec4 k0 = unpackInt4x8(kp.r); + const ivec4 k1 = unpackInt4x8(kp.g); + const ivec4 k2 = unpackInt4x8(kp.b); + const ivec4 k3 = unpackInt4x8(kp.a); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, + v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + + w_offset += 4; + } +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(size) || gy >= psc(outc)) + return; +#endif + + const int base_ch = gy * 4; + const int nout = num_output; + const vec4 descale = weight_descales_data[gy]; + + vec4 sumfp32_0 = vec4(sum0) * descale; + vec4 sumfp32_1 = vec4(sum1) * descale; + vec4 sumfp32_2 = vec4(sum2) * descale; + vec4 sumfp32_3 = vec4(sum3) * descale; + + if (bias_term == 1) + { + const vec4 bias = bias_data[gy]; + + sumfp32_0 += bias; + sumfp32_1 += bias; + sumfp32_2 += bias; + sumfp32_3 += bias; + } + + sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), 
activation_type, activation_param_0, activation_param_1)); + sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1)); + sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1)); + sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + sumfp32_0 *= top_scale; + sumfp32_1 *= top_scale; + sumfp32_2 *= top_scale; + sumfp32_3 *= top_scale; + + const ivec4 v0 = float2int8vec4(sumfp32_0); + const ivec4 v1 = float2int8vec4(sumfp32_1); + const ivec4 v2 = float2int8vec4(sumfp32_2); + const ivec4 v3 = float2int8vec4(sumfp32_3); + + if (out_elempack == 4) + { + const int gi = gy * psc(outcstep) + gx * 4; + + i8buffer_st4(top_blob_int8_data, gi + 0, v0); + if (gx * 4 + 1 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 1, v1); + if (gx * 4 + 2 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 2, v2); + if (gx * 4 + 3 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 3, v3); + } + else // out_elempack == 1 + { + const ivec4 o0 = ivec4(v0.r, v1.r, v2.r, v3.r); + const ivec4 o1 = ivec4(v0.g, v1.g, v2.g, v3.g); + const ivec4 o2 = ivec4(v0.b, v1.b, v2.b, v3.b); + const ivec4 o3 = ivec4(v0.a, v1.a, v2.a, v3.a); + + const int gi = base_ch * psc(outcstep_native) + gx; + + i8buffer_st4(top_blob_int8_data, gi, o0); + if (base_ch + 1 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native), o1); + if (base_ch + 2 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native) * 2, o2); + if (base_ch + 3 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native) * 3, o3); + } + } + else + { + if (out_elempack == 4) + { + const int gi = gy * psc(outcstep) + gx * 4; + + buffer_st4(top_blob_data, gi + 0, afpvec4(sumfp32_0)); + if (gx * 4 + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 
1, afpvec4(sumfp32_1)); + if (gx * 4 + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, afpvec4(sumfp32_2)); + if (gx * 4 + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, afpvec4(sumfp32_3)); + } + else // out_elempack == 1 + { + const vec4 o0 = vec4(sumfp32_0.r, sumfp32_1.r, sumfp32_2.r, sumfp32_3.r); + const vec4 o1 = vec4(sumfp32_0.g, sumfp32_1.g, sumfp32_2.g, sumfp32_3.g); + const vec4 o2 = vec4(sumfp32_0.b, sumfp32_1.b, sumfp32_2.b, sumfp32_3.b); + const vec4 o3 = vec4(sumfp32_0.a, sumfp32_1.a, sumfp32_2.a, sumfp32_3.a); + + const int gi = base_ch * psc(outcstep_native) + gx; + + buffer_st4(top_blob_data, gi, afpvec4(o0)); + if (base_ch + 1 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native), afpvec4(o1)); + if (base_ch + 2 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native) * 2, afpvec4(o2)); + if (base_ch + 3 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native) * 3, afpvec4(o3)); + } + } +} diff --git a/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp b/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp new file mode 100644 index 000000000000..cc45e90be915 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp @@ -0,0 +1,608 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +#define LOCAL_MEMORY_UNROLL_INCH 8 + +layout(constant_id = 0) const int kernel_w = 1; +layout(constant_id = 1) const int kernel_h = 1; +layout(constant_id = 2) const int dilation_w = 1; +layout(constant_id = 3) const int dilation_h = 1; +layout(constant_id = 4) const int stride_w = 1; +layout(constant_id = 5) const int stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) 
const int activation_type = 0; +layout(constant_id = 8) const float activation_param_0 = 0; +layout(constant_id = 9) const float activation_param_1 = 0; +layout(constant_id = 10) const int use_int8_requantize = 0; +layout(constant_id = 11) const int elempack = 1; +layout(constant_id = 12) const int out_elempack = 1; + +#define shape_constant_id_offset 13 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int c = 0; +layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; +layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outcstep = 0; +layout(constant_id = shape_constant_id_offset + 8) const int num_output = 0; +layout(constant_id = shape_constant_id_offset + 9) const int num_input = 0; + +layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; +layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; +layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; +layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; +layout(binding = 4) readonly buffer weight_blob { ivec4 weight_data[]; }; +layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 6) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 7) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; }; +layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int c; 
+ int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#if NCNN_shader_local_memory +shared ivec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH]; +shared ivec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + +/* Int8 convolution written as a GEMM over N = c * kernel_w * kernel_h packed steps. Each invocation accumulates int32 sums for 4 consecutive flattened output positions (gx .. gx+3 over outw * outh) and one group of 4 output channels (index gy), then descales to float, optionally adds bias, applies the activation, and stores either float output or requantized int8 output. */void main() +{ + const int gx = int(gl_GlobalInvocationID.x) * 4; + const int gy = int(gl_GlobalInvocationID.y); + + const int outsize = psc(outw) * psc(outh); + +#if !NCNN_shader_local_memory +/* Without shared memory there is no barrier() in this shader, so out-of-range invocations leave immediately. */ if (gx >= outsize || gy >= psc(outc)) + return; +#endif + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) * maxk; + const ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + const ivec4 sy4 = gx4 / psc(outw); + const ivec4 sx4 = gx4 % psc(outw); + + const ivec4 sxs4 = sx4 * stride_w; + const ivec4 sys4 = sy4 * stride_h; + + ivec4 sum0 = ivec4(0); + ivec4 sum1 = ivec4(0); + ivec4 sum2 = ivec4(0); + ivec4 sum3 = ivec4(0); + +/* w_offset is kept in units of single ints; weight_data is declared as ivec4, which is why every load below divides by 4. */ int w_offset = gy * N * 4; + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < N; z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < LOCAL_MEMORY_UNROLL_INCH) + { + const int zz = z + ly; + const int sz = zz / maxk; + const int k = zz - sz * maxk; + const int ky = k / kernel_w; + const int kx = k - ky * kernel_w; + + int v0p = 0; + int v1p = 0; + int v2p = 0; + int v3p = 0; + + if (elempack == 4) + { + const ivec4 v_offset = sz * psc(cstep) + (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r); + if (gx4.g < outsize) v1p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g); + if (gx4.b < outsize) v2p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b); + if (gx4.a < outsize) v3p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a); + } + else // elempack == 1 + { + const int ch0 = sz * 4; + const ivec4 base_spatial = (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) + { + ivec4 v = ivec4(0); +
v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.r); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.r); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.r); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.r); + + v0p = packInt4x8(v); + } + if (gx4.g < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.g); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.g); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.g); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.g); + + v1p = packInt4x8(v); + } + if (gx4.b < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.b); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.b); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.b); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.b); + + v2p = packInt4x8(v); + } + if (gx4.a < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.a); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.a); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.a); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.a); + + v3p = packInt4x8(v); + } + } + + tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p); + }
+ +/* NOTE(review): unlike the input loads above, which are guarded by gx4 < outsize, this weight load is not guarded by gy < psc(outc). In this path the bounds check runs only after the loops, so an invocation with gy >= outc indexes weight_data starting at gy * N. Its result is discarded later, but please confirm the dispatch size or buffer padding keeps the read in bounds. */ if (lx < LOCAL_MEMORY_UNROLL_INCH) + { + tmp_k[ly][lx] = weight_data[w_offset / 4 + lx]; + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + const ivec4 vp = tmp_v[lx][z4]; + const int v0p = vp.r; + const int v1p = vp.g; + const int v2p = vp.b; + const int v3p = vp.a; + + const ivec4 kp = tmp_k[ly][z4]; + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(k0p); + const ivec4 k1 = unpackInt4x8(k1p); + const ivec4 k2 = unpackInt4x8(k2p); + const ivec4 k3 = unpackInt4x8(k3p); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, +
v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + } + + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); + } + +/* Tail: handles the last N - z steps (fewer than LOCAL_MEMORY_UNROLL_INCH) with the same stage, barrier, accumulate sequence as the loop above. */ if (z < N) + { + const int remain = N - z; + + if (ly < remain) + { + const int zz = z + ly; + const int sz = zz / maxk; + const int k = zz - sz * maxk; + const int ky = k / kernel_w; + const int kx = k - ky * kernel_w; + + int v0p = 0; + int v1p = 0; + int v2p = 0; + int v3p = 0; + + if (elempack == 4) + { + const ivec4 v_offset = sz * psc(cstep) + (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r); + if (gx4.g < outsize) v1p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g); + if (gx4.b < outsize) v2p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b); + if (gx4.a < outsize) v3p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a); + } + else // elempack == 1 + { + const int ch0 = sz * 4; + const ivec4 base_spatial = (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.r); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.r); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.r); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.r); + + v0p = packInt4x8(v); + } + if (gx4.g < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.g); + if (ch0 + 1 < num_input) v.g =
i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.g); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.g); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.g); + + v1p = packInt4x8(v); + } + if (gx4.b < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.b); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.b); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.b); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.b); + + v2p = packInt4x8(v); + } + if (gx4.a < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.a); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.a); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.a); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.a); + + v3p = packInt4x8(v); + } + } + + tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p); + } + + if (lx < remain) + { + tmp_k[ly][lx] = weight_data[w_offset / 4 + lx]; + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + const ivec4 vp = tmp_v[lx][z4]; + const int v0p = vp.r; + const int v1p = vp.g; + const int v2p = vp.b; + const int v3p = vp.a; + + const ivec4 kp = tmp_k[ly][z4]; + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p),
dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(k0p); + const ivec4 k1 = unpackInt4x8(k1p); + const ivec4 k2 = unpackInt4x8(k2p); + const ivec4 k3 = unpackInt4x8(k3p); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, + v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + } + } +#else + for (int z = 0; z < N; z++) + { + const int sz = z / maxk; + const int k = z - sz * maxk; + const int ky = k / kernel_w; + const int kx = k - ky * kernel_w; + + int v0p = 0; + int v1p = 0; + int v2p = 0; + int v3p = 0; + + if (elempack == 4) + { + const
ivec4 v_offset = sz * psc(cstep) + (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r); + if (gx4.g < outsize) v1p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g); + if (gx4.b < outsize) v2p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b); + if (gx4.a < outsize) v3p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a); + } + else // elempack == 1 + { + const int ch0 = sz * 4; + const ivec4 base_spatial = (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w; + + if (gx4.r < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.r); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.r); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.r); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.r); + v0p = packInt4x8(v); + } + if (gx4.g < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.g); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.g); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.g); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.g); + v1p = packInt4x8(v); + } + if (gx4.b < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.b); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.b); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.b); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) *
psc(cstep) + base_spatial.b); + v2p = packInt4x8(v); + } + if (gx4.a < outsize) + { + ivec4 v = ivec4(0); + v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + base_spatial.a); + if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + base_spatial.a); + if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + base_spatial.a); + if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + base_spatial.a); + v3p = packInt4x8(v); + } + } + + const ivec4 kp = weight_data[w_offset / 4]; + const int k0p = kp.r; + const int k1p = kp.g; + const int k2p = kp.b; + const int k3p = kp.a; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(kp.r); + const ivec4 k1 = unpackInt4x8(kp.g); + const ivec4 k2 = unpackInt4x8(kp.b); + const ivec4 k3 = unpackInt4x8(kp.a); + + sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a, + v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a, + v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a, + v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a); + sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a, + v1.r * k1.r + v1.g * k1.g +
v1.b * k1.b + v1.a * k1.a, + v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a, + v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a); + sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a, + v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a, + v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a, + v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a); + sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a, + v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a, + v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a, + v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a); +#endif + + w_offset += 4; + } +#endif + +#if NCNN_shader_local_memory +/* In the shared-memory path the bounds check is placed after the accumulation loops, presumably so that every invocation of the workgroup reaches the barrier() calls above. */ if (gx >= outsize || gy >= psc(outc)) + return; +#endif + +/* gy selects one group of 4 output channels: descale and bias are each read as one vec4 at index gy, and nout bounds the per-channel stores in the out_elempack == 1 branches. */ const int base_ch = gy * 4; + const int nout = num_output; + const vec4 descale = weight_descales_data[gy]; + + vec4 sumfp32_0 = vec4(sum0) * descale; + vec4 sumfp32_1 = vec4(sum1) * descale; + vec4 sumfp32_2 = vec4(sum2) * descale; + vec4 sumfp32_3 = vec4(sum3) * descale; + + if (bias_term == 1) + { + const vec4 bias = bias_data[gy]; + + sumfp32_0 += bias; + sumfp32_1 += bias; + sumfp32_2 += bias; + sumfp32_3 += bias; + } + + sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), activation_type, activation_param_0, activation_param_1)); + sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1)); + sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1)); + sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + const float top_scale = buffer_ld1(top_scales_data, 0); + sumfp32_0 *= top_scale; + sumfp32_1 *= top_scale; + sumfp32_2 *= top_scale; + sumfp32_3 *= top_scale; + + const ivec4 v0 = float2int8vec4(sumfp32_0); + const ivec4 v1 = float2int8vec4(sumfp32_1); + const ivec4 v2 = float2int8vec4(sumfp32_2); +
const ivec4 v3 = float2int8vec4(sumfp32_3); + + if (out_elempack == 4) + { + const int gi = gy * psc(outcstep) + gx; + + i8buffer_st4(top_blob_int8_data_4, gi, v0); + if (gx + 1 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 1, v1); + if (gx + 2 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 2, v2); + if (gx + 3 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 3, v3); + } + else // out_elempack == 1 + { +/* NOTE(review): channel_step is outcstep / 4 while gi uses gy * outcstep, so this assumes the host passes outcstep as 4x the per-channel stride when out_elempack == 1. TODO confirm against the pipeline setup. */ const int channel_step = psc(outcstep) / 4; + const int gi = gy * psc(outcstep) + gx; + + i8buffer_st1(top_blob_int8_data_1, gi, v0.r); + if (base_ch + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi + channel_step, v0.g); + if (base_ch + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 2, v0.b); + if (base_ch + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 3, v0.a); + + if (gx + 1 < outsize) + { + i8buffer_st1(top_blob_int8_data_1, gi + 1, v1.r); + if (base_ch + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 1 + channel_step, v1.g); + if (base_ch + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 1 + channel_step * 2, v1.b); + if (base_ch + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 1 + channel_step * 3, v1.a); + } + if (gx + 2 < outsize) + { + i8buffer_st1(top_blob_int8_data_1, gi + 2, v2.r); + if (base_ch + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 2 + channel_step, v2.g); + if (base_ch + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 2 + channel_step * 2, v2.b); + if (base_ch + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 2 + channel_step * 3, v2.a); + } + if (gx + 3 < outsize) + { + i8buffer_st1(top_blob_int8_data_1, gi + 3, v3.r); + if (base_ch + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 3 + channel_step, v3.g); + if (base_ch + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 3 + channel_step * 2, v3.b); + if (base_ch + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi + 3 + channel_step * 3, v3.a); + } + } + } + else + { + if (out_elempack == 4) + { + const int gi = gy *
psc(outcstep) + gx; + + buffer_st4(top_blob_data_4, gi, afpvec4(sumfp32_0)); + if (gx + 1 < outsize) buffer_st4(top_blob_data_4, gi + 1, afpvec4(sumfp32_1)); + if (gx + 2 < outsize) buffer_st4(top_blob_data_4, gi + 2, afpvec4(sumfp32_2)); + if (gx + 3 < outsize) buffer_st4(top_blob_data_4, gi + 3, afpvec4(sumfp32_3)); + } + else // out_elempack == 1 + { + const int channel_step = psc(outcstep) / 4; + const int gi = gy * psc(outcstep) + gx; + + buffer_st1(top_blob_data_1, gi, afp(sumfp32_0.r)); + if (base_ch + 1 < nout) buffer_st1(top_blob_data_1, gi + channel_step, afp(sumfp32_0.g)); + if (base_ch + 2 < nout) buffer_st1(top_blob_data_1, gi + channel_step * 2, afp(sumfp32_0.b)); + if (base_ch + 3 < nout) buffer_st1(top_blob_data_1, gi + channel_step * 3, afp(sumfp32_0.a)); + + if (gx + 1 < outsize) + { + buffer_st1(top_blob_data_1, gi + 1, afp(sumfp32_1.r)); + if (base_ch + 1 < nout) buffer_st1(top_blob_data_1, gi + 1 + channel_step, afp(sumfp32_1.g)); + if (base_ch + 2 < nout) buffer_st1(top_blob_data_1, gi + 1 + channel_step * 2, afp(sumfp32_1.b)); + if (base_ch + 3 < nout) buffer_st1(top_blob_data_1, gi + 1 + channel_step * 3, afp(sumfp32_1.a)); + } + if (gx + 2 < outsize) + { + buffer_st1(top_blob_data_1, gi + 2, afp(sumfp32_2.r)); + if (base_ch + 1 < nout) buffer_st1(top_blob_data_1, gi + 2 + channel_step, afp(sumfp32_2.g)); + if (base_ch + 2 < nout) buffer_st1(top_blob_data_1, gi + 2 + channel_step * 2, afp(sumfp32_2.b)); + if (base_ch + 3 < nout) buffer_st1(top_blob_data_1, gi + 2 + channel_step * 3, afp(sumfp32_2.a)); + } + if (gx + 3 < outsize) + { + buffer_st1(top_blob_data_1, gi + 3, afp(sumfp32_3.r)); + if (base_ch + 1 < nout) buffer_st1(top_blob_data_1, gi + 3 + channel_step, afp(sumfp32_3.g)); + if (base_ch + 2 < nout) buffer_st1(top_blob_data_1, gi + 3 + channel_step * 2, afp(sumfp32_3.b)); + if (base_ch + 3 < nout) buffer_st1(top_blob_data_1, gi + 3 + channel_step * 3, afp(sumfp32_3.a)); + } + } + } +} diff --git
a/src/layer/vulkan/shader/convolution_packed_int8.comp b/src/layer/vulkan/shader/convolution_packed_int8.comp new file mode 100644 index 000000000000..42faac9b33dc --- /dev/null +++ b/src/layer/vulkan/shader/convolution_packed_int8.comp @@ -0,0 +1,439 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int kernel_w = 1; +layout(constant_id = 1) const int kernel_h = 1; +layout(constant_id = 2) const int dilation_w = 1; +layout(constant_id = 3) const int dilation_h = 1; +layout(constant_id = 4) const int stride_w = 1; +layout(constant_id = 5) const int stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) const int activation_type = 0; +layout(constant_id = 8) const float activation_param_0 = 0; +layout(constant_id = 9) const float activation_param_1 = 0; +layout(constant_id = 10) const int use_int8_requantize = 0; +layout(constant_id = 11) const int elempack = 1; +layout(constant_id = 12) const int out_elempack = 1; + +#define shape_constant_id_offset 13 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; +layout(constant_id = shape_constant_id_offset + 2) const int c = 0; +layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; +layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outcstep = 0; +layout(constant_id = shape_constant_id_offset + 8) const int num_output = 0; 
+layout(constant_id = shape_constant_id_offset + 9) const int num_input = 0; + +// scalar view (for pack1 input/output access) +layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; +layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; + +// vec4 view (for pack4 input/output access, weight/bias always vec4) +layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; +layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; +layout(binding = 4) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 6) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; +layout(binding = 7) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; }; +layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + int gz = int(gl_GlobalInvocationID.z) * 2; + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int maxk = kernel_w * kernel_h; + ivec2 gx2 = gx + ivec2(0, 1); + ivec2 gy2 = gy + ivec2(0, 1); + ivec2 gz2 = gz + ivec2(0, 1); + + ivec4 sum0 = ivec4(0); + ivec4 sum1 = ivec4(0); + ivec4 sum2 = ivec4(0); + ivec4 sum3 = ivec4(0); + ivec4 sum4 = ivec4(0); + ivec4 sum5 = ivec4(0); + ivec4 sum6 = ivec4(0); + ivec4 sum7 = ivec4(0); + + for (int z = 0; z < psc(c); z++) + { + if (elempack == 4) + { + ivec4 v_offset; + v_offset.rg = z * psc(cstep) + gy2.x * stride_h * psc(w) + gx2 * stride_w; + v_offset.ba = z * psc(cstep) + gy2.y * stride_h * psc(w) + gx2 * stride_w; + + ivec2 
w_offset = (gz2 * psc(c) + z) * maxk * 4; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + int v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r + x * dilation_w); + int v1p = gx + 1 < psc(outw) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g + x * dilation_w) : 0; + int v2p = gy + 1 < psc(outh) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b + x * dilation_w) : 0; + int v3p = gx + 1 < psc(outw) && gy + 1 < psc(outh) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a + x * dilation_w) : 0; + + int k0p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 0); + int k1p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 1); + int k2p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 2); + int k3p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 3); + int k4p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 0); + int k5p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 1); + int k6p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 2); + int k7p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 3); + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); + sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p)); + sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p)); + sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p)); + sum4 += ivec4(dotPacked4x8EXT(v0p, k4p), dotPacked4x8EXT(v0p, k5p), dotPacked4x8EXT(v0p, k6p), dotPacked4x8EXT(v0p, k7p)); + sum5 += ivec4(dotPacked4x8EXT(v1p, k4p), dotPacked4x8EXT(v1p, k5p), dotPacked4x8EXT(v1p, k6p), dotPacked4x8EXT(v1p, k7p)); + sum6 += ivec4(dotPacked4x8EXT(v2p, k4p), dotPacked4x8EXT(v2p, 
k5p), dotPacked4x8EXT(v2p, k6p), dotPacked4x8EXT(v2p, k7p)); + sum7 += ivec4(dotPacked4x8EXT(v3p, k4p), dotPacked4x8EXT(v3p, k5p), dotPacked4x8EXT(v3p, k6p), dotPacked4x8EXT(v3p, k7p)); +#else + ivec4 v0 = unpackInt4x8(v0p); + ivec4 v1 = unpackInt4x8(v1p); + ivec4 v2 = unpackInt4x8(v2p); + ivec4 v3 = unpackInt4x8(v3p); + ivec4 k0 = unpackInt4x8(k0p); + ivec4 k1 = unpackInt4x8(k1p); + ivec4 k2 = unpackInt4x8(k2p); + ivec4 k3 = unpackInt4x8(k3p); + ivec4 k4 = unpackInt4x8(k4p); + ivec4 k5 = unpackInt4x8(k5p); + ivec4 k6 = unpackInt4x8(k6p); + ivec4 k7 = unpackInt4x8(k7p); + + sum0.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a; + sum0.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a; + sum0.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a; + sum0.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a; + sum1.r += v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a; + sum1.g += v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a; + sum1.b += v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a; + sum1.a += v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a; + sum2.r += v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a; + sum2.g += v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a; + sum2.b += v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a; + sum2.a += v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a; + sum3.r += v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a; + sum3.g += v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a; + sum3.b += v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a; + sum3.a += v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a; + sum4.r += v0.r * k4.r + v0.g * k4.g + v0.b * k4.b + v0.a * k4.a; + sum4.g += v0.r * k5.r + v0.g * k5.g + v0.b * k5.b + v0.a * k5.a; + sum4.b += v0.r * k6.r + v0.g * k6.g + v0.b * k6.b + v0.a * k6.a; + sum4.a += v0.r * k7.r + v0.g * k7.g + v0.b * k7.b + v0.a * k7.a; + sum5.r += v1.r * k4.r + v1.g * k4.g + v1.b * k4.b + v1.a * k4.a; + sum5.g += 
v1.r * k5.r + v1.g * k5.g + v1.b * k5.b + v1.a * k5.a; + sum5.b += v1.r * k6.r + v1.g * k6.g + v1.b * k6.b + v1.a * k6.a; + sum5.a += v1.r * k7.r + v1.g * k7.g + v1.b * k7.b + v1.a * k7.a; + sum6.r += v2.r * k4.r + v2.g * k4.g + v2.b * k4.b + v2.a * k4.a; + sum6.g += v2.r * k5.r + v2.g * k5.g + v2.b * k5.b + v2.a * k5.a; + sum6.b += v2.r * k6.r + v2.g * k6.g + v2.b * k6.b + v2.a * k6.a; + sum6.a += v2.r * k7.r + v2.g * k7.g + v2.b * k7.b + v2.a * k7.a; + sum7.r += v3.r * k4.r + v3.g * k4.g + v3.b * k4.b + v3.a * k4.a; + sum7.g += v3.r * k5.r + v3.g * k5.g + v3.b * k5.b + v3.a * k5.a; + sum7.b += v3.r * k6.r + v3.g * k6.g + v3.b * k6.b + v3.a * k6.a; + sum7.a += v3.r * k7.r + v3.g * k7.g + v3.b * k7.b + v3.a * k7.a; +#endif + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w * 4; + } + } + else // elempack == 1 + { + ivec2 w_offset = (gz2 * psc(c) + z) * maxk; + + for (int y = 0; y < kernel_h; y++) + { + ivec4 v_offset; + v_offset.rg = z * psc(cstep) + gy2.x * stride_h * psc(w) + gx2 * stride_w + y * dilation_h * psc(w); + v_offset.ba = z * psc(cstep) + gy2.y * stride_h * psc(w) + gx2 * stride_w + y * dilation_h * psc(w); + + for (int x = 0; x < kernel_w; x++) + { + int v0 = i8buffer_ld1(bottom_blob_int8_data_1, v_offset.r + x * dilation_w); + int v1 = gx + 1 < psc(outw) ? i8buffer_ld1(bottom_blob_int8_data_1, v_offset.g + x * dilation_w) : 0; + int v2 = gy + 1 < psc(outh) ? i8buffer_ld1(bottom_blob_int8_data_1, v_offset.b + x * dilation_w) : 0; + int v3 = gx + 1 < psc(outw) && gy + 1 < psc(outh) ? 
i8buffer_ld1(bottom_blob_int8_data_1, v_offset.a + x * dilation_w) : 0; + + int k0p = i8buffer_sm4(weight_data, w_offset.x + x); + int k1p = i8buffer_sm4(weight_data, w_offset.y + x); + + ivec4 k0 = unpackInt4x8(k0p); + ivec4 k1 = unpackInt4x8(k1p); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v2 * k0; + sum3 += v3 * k0; + sum4 += v0 * k1; + sum5 += v1 * k1; + sum6 += v2 * k1; + sum7 += v3 * k1; + } + + w_offset += kernel_w; + } + } + } + + int nout = num_output; + ivec2 base_ch = gz2 * 4; + + const vec4 descale0 = weight_descales_data[gz2.x]; + const vec4 descale1 = weight_descales_data[gz2.y]; + + vec4 sumfp32_0 = vec4(sum0) * descale0; + vec4 sumfp32_1 = vec4(sum1) * descale0; + vec4 sumfp32_2 = vec4(sum2) * descale0; + vec4 sumfp32_3 = vec4(sum3) * descale0; + vec4 sumfp32_4 = vec4(sum4) * descale1; + vec4 sumfp32_5 = vec4(sum5) * descale1; + vec4 sumfp32_6 = vec4(sum6) * descale1; + vec4 sumfp32_7 = vec4(sum7) * descale1; + + if (bias_term == 1) + { + vec4 bias0 = bias_data[gz2.x]; + vec4 bias1 = bias_data[gz2.y]; + + sumfp32_0 += bias0; + sumfp32_1 += bias0; + sumfp32_2 += bias0; + sumfp32_3 += bias0; + sumfp32_4 += bias1; + sumfp32_5 += bias1; + sumfp32_6 += bias1; + sumfp32_7 += bias1; + } + + sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), activation_type, activation_param_0, activation_param_1)); + sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1)); + sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1)); + sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1)); + sumfp32_4 = vec4(activation_afpvec4(afpvec4(sumfp32_4), activation_type, activation_param_0, activation_param_1)); + sumfp32_5 = vec4(activation_afpvec4(afpvec4(sumfp32_5), activation_type, activation_param_0, activation_param_1)); + sumfp32_6 = vec4(activation_afpvec4(afpvec4(sumfp32_6), 
activation_type, activation_param_0, activation_param_1)); + sumfp32_7 = vec4(activation_afpvec4(afpvec4(sumfp32_7), activation_type, activation_param_0, activation_param_1)); + + if (use_int8_requantize == 1) + { + float top_scale = buffer_ld1(top_scales_data, 0); + sumfp32_0 *= top_scale; + sumfp32_1 *= top_scale; + sumfp32_2 *= top_scale; + sumfp32_3 *= top_scale; + sumfp32_4 *= top_scale; + sumfp32_5 *= top_scale; + sumfp32_6 *= top_scale; + sumfp32_7 *= top_scale; + + ivec4 v0 = float2int8vec4(sumfp32_0); + ivec4 v1 = float2int8vec4(sumfp32_1); + ivec4 v2 = float2int8vec4(sumfp32_2); + ivec4 v3 = float2int8vec4(sumfp32_3); + ivec4 v4 = float2int8vec4(sumfp32_4); + ivec4 v5 = float2int8vec4(sumfp32_5); + ivec4 v6 = float2int8vec4(sumfp32_6); + ivec4 v7 = float2int8vec4(sumfp32_7); + + if (out_elempack == 4) + { + ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; + + i8buffer_st4(top_blob_int8_data_4, gi.x, v0); + if (gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.x + 1, v1); + if (gy + 1 < psc(outh)) i8buffer_st4(top_blob_int8_data_4, gi.x + psc(outw), v2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.x + psc(outw) + 1, v3); + if (gz + 1 < psc(outc)) + { + i8buffer_st4(top_blob_int8_data_4, gi.y, v4); + if (gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.y + 1, v5); + if (gy + 1 < psc(outh)) i8buffer_st4(top_blob_int8_data_4, gi.y + psc(outw), v6); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.y + psc(outw) + 1, v7); + } + } + else // out_elempack == 1 + { + int channel_step = psc(outcstep) / 4; + ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; + + i8buffer_st1(top_blob_int8_data_1, gi.x, v0.r); + if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + channel_step, v0.g); + if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + channel_step * 2, v0.b); + if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 
channel_step * 3, v0.a); + if (gx + 1 < psc(outw)) + { + i8buffer_st1(top_blob_int8_data_1, gi.x + 1, v1.r); + if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step, v1.g); + if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step * 2, v1.b); + if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step * 3, v1.a); + } + if (gy + 1 < psc(outh)) + { + int gi2 = gi.x + psc(outw); + i8buffer_st1(top_blob_int8_data_1, gi2, v2.r); + if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step, v2.g); + if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step * 2, v2.b); + if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step * 3, v2.a); + } + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) + { + int gi3 = gi.x + psc(outw) + 1; + i8buffer_st1(top_blob_int8_data_1, gi3, v3.r); + if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step, v3.g); + if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step * 2, v3.b); + if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step * 3, v3.a); + } + if (gz + 1 < psc(outc)) + { + if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi.y, v4.r); + if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step, v4.g); + if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step * 2, v4.b); + if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step * 3, v4.a); + if (gx + 1 < psc(outw)) + { + if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1, v5.r); + if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step, v5.g); + if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step * 2, v5.b); + if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step * 3, v5.a); + } 
+ if (gy + 1 < psc(outh)) + { + int gi6 = gi.y + psc(outw); + if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi6, v6.r); + if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step, v6.g); + if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step * 2, v6.b); + if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step * 3, v6.a); + } + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) + { + int gi7 = gi.y + psc(outw) + 1; + if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi7, v7.r); + if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step, v7.g); + if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step * 2, v7.b); + if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step * 3, v7.a); + } + } + } + } + else + { + if (out_elempack == 4) + { + ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data_4, gi.x, afpvec4(sumfp32_0)); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.x + 1, afpvec4(sumfp32_1)); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data_4, gi.x + psc(outw), afpvec4(sumfp32_2)); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.x + psc(outw) + 1, afpvec4(sumfp32_3)); + if (gz + 1 < psc(outc)) + { + buffer_st4(top_blob_data_4, gi.y, afpvec4(sumfp32_4)); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.y + 1, afpvec4(sumfp32_5)); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data_4, gi.y + psc(outw), afpvec4(sumfp32_6)); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.y + psc(outw) + 1, afpvec4(sumfp32_7)); + } + } + else // out_elempack == 1 + { + int channel_step = psc(outcstep) / 4; + ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data_1, gi.x, afp(sumfp32_0.r)); + if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step, afp(sumfp32_0.g)); + if 
(base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step * 2, afp(sumfp32_0.b)); + if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step * 3, afp(sumfp32_0.a)); + if (gx + 1 < psc(outw)) + { + buffer_st1(top_blob_data_1, gi.x + 1, afp(sumfp32_1.r)); + if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step, afp(sumfp32_1.g)); + if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step * 2, afp(sumfp32_1.b)); + if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step * 3, afp(sumfp32_1.a)); + } + if (gy + 1 < psc(outh)) + { + int gi2 = gi.x + psc(outw); + buffer_st1(top_blob_data_1, gi2, afp(sumfp32_2.r)); + if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step, afp(sumfp32_2.g)); + if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step * 2, afp(sumfp32_2.b)); + if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step * 3, afp(sumfp32_2.a)); + } + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) + { + int gi3 = gi.x + psc(outw) + 1; + buffer_st1(top_blob_data_1, gi3, afp(sumfp32_3.r)); + if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step, afp(sumfp32_3.g)); + if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step * 2, afp(sumfp32_3.b)); + if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step * 3, afp(sumfp32_3.a)); + } + if (gz + 1 < psc(outc)) + { + if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi.y, afp(sumfp32_4.r)); + if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step, afp(sumfp32_4.g)); + if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step * 2, afp(sumfp32_4.b)); + if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step * 3, afp(sumfp32_4.a)); + if (gx + 1 < psc(outw)) + { + if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi.y + 1, afp(sumfp32_5.r)); + if (base_ch.y + 1 < nout) 
buffer_st1(top_blob_data_1, gi.y + 1 + channel_step, afp(sumfp32_5.g)); + if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi.y + 1 + channel_step * 2, afp(sumfp32_5.b)); + if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi.y + 1 + channel_step * 3, afp(sumfp32_5.a)); + } + if (gy + 1 < psc(outh)) + { + int gi6 = gi.y + psc(outw); + if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi6, afp(sumfp32_6.r)); + if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step, afp(sumfp32_6.g)); + if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step * 2, afp(sumfp32_6.b)); + if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step * 3, afp(sumfp32_6.a)); + } + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) + { + int gi7 = gi.y + psc(outw) + 1; + if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi7, afp(sumfp32_7.r)); + if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step, afp(sumfp32_7.g)); + if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step * 2, afp(sumfp32_7.b)); + if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step * 3, afp(sumfp32_7.a)); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp new file mode 100644 index 000000000000..ab4782d2da26 --- /dev/null +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp @@ -0,0 +1,202 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int kernel_w = 1; +layout(constant_id = 1) const int kernel_h = 1; +layout(constant_id = 2) const int 
dilation_w = 1; +layout(constant_id = 3) const int dilation_h = 1; +layout(constant_id = 4) const int stride_w = 1; +layout(constant_id = 5) const int stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) const int group = 1; +layout(constant_id = 8) const int activation_type = 0; +layout(constant_id = 9) const float activation_param_0 = 0; +layout(constant_id = 10) const float activation_param_1 = 0; +layout(constant_id = 11) const int use_int8_requantize = 0; +layout(constant_id = 12) const int elempack = 1; +layout(constant_id = 13) const int out_elempack = 1; +layout(constant_id = 14) const int num_output_g = 1; + +#define shape_constant_id_offset 15 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; +layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; +layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; +layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; +layout(binding = 4) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 6) readonly buffer descales_blob { vec4 descales_data[]; }; 
+layout(binding = 7) readonly buffer top_scales_blob { sfpvec4 top_scales_data[]; }; +layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; }; +layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + int outc_g = (num_output_g + 3) / 4; + int gg = gz / outc_g; + gz = gz - gg * outc_g; + + if (gx >= psc(outw) || gy >= psc(outh) || gg >= group) + return; + + int maxk = kernel_w * kernel_h; + int channels_g = psc(c) / group; + int outc_g_aligned = (outc_g + 7) / 8 * 8; + int gz_aligned = gg * outc_g_aligned + gz; + int top_z = gg * outc_g + gz; + + ivec4 sum = ivec4(0); + + // group convolution + for (int z = 0; z < channels_g; z++) + { + if (elempack == 4) + { + int v_offset = (gg * channels_g + z) * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + int w_offset = (gz_aligned * channels_g + z) * maxk * 4; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + int v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset + x * dilation_w); + int k0p = i8buffer_sm4(weight_data, w_offset + x * 4 + 0); + int k1p = i8buffer_sm4(weight_data, w_offset + x * 4 + 1); + int k2p = i8buffer_sm4(weight_data, w_offset + x * 4 + 2); + int k3p = i8buffer_sm4(weight_data, w_offset + x * 4 + 3); + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p)); +#else + ivec4 v0 = unpackInt4x8(v0p); + ivec4 k0 = unpackInt4x8(k0p); + ivec4 k1 = unpackInt4x8(k1p); + ivec4 k2 = 
unpackInt4x8(k2p); + ivec4 k3 = unpackInt4x8(k3p); + + sum.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a; + sum.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a; + sum.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a; + sum.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a; +#endif + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w * 4; + } + } + else // elempack == 1 + { + int w_offset = (gz_aligned * channels_g + z) * maxk; + + for (int y = 0; y < kernel_h; y++) + { + int v_offset = (gg * channels_g + z) * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w + y * dilation_h * psc(w); + + for (int x = 0; x < kernel_w; x++) + { + int v0 = i8buffer_ld1(bottom_blob_int8_data_1, v_offset + x * dilation_w); + int k0p = i8buffer_sm4(weight_data, w_offset + x); + ivec4 k0 = unpackInt4x8(k0p); + + sum += v0 * k0; + } + + w_offset += kernel_w; + } + } + } + + vec4 sumfp32 = vec4(sum) * descales_data[gz_aligned]; + + if (bias_term == 1) + { + sumfp32 += bias_data[gz_aligned]; + } + + sumfp32 = vec4(activation_afpvec4(afpvec4(sumfp32), activation_type, activation_param_0, activation_param_1)); + + int outch0 = gz * 4; + + if (use_int8_requantize == 1) + { + vec4 top_scale = vec4(buffer_ld4(top_scales_data, gz_aligned)); + sumfp32 *= top_scale; + ivec4 v = float2int8vec4(sumfp32); + + if (out_elempack == 4) + { + int gi = top_z * psc(outcstep) + gy * psc(outw) + gx; + i8buffer_st4(top_blob_int8_data_4, gi, v); + } + else // out_elempack == 1 + { + int channel_step = psc(outcstep) / 4; + int base_ch = gg * num_output_g + outch0; + int gi = base_ch * channel_step + gy * psc(outw) + gx; + + i8buffer_st1(top_blob_int8_data_1, gi, v.r); + if (outch0 + 1 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step, v.g); + if (outch0 + 2 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 2, v.b); + if (outch0 + 3 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 3, v.a); + } + } 
+ else + { + if (out_elempack == 4) + { + int gi = top_z * psc(outcstep) + gy * psc(outw) + gx; + buffer_st4(top_blob_data_4, gi, afpvec4(sumfp32)); + } + else // out_elempack == 1 + { + int channel_step = psc(outcstep) / 4; + int base_ch = gg * num_output_g + outch0; + int gi = base_ch * channel_step + gy * psc(outw) + gx; + + buffer_st1(top_blob_data_1, gi, afp(sumfp32.r)); + if (outch0 + 1 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step, afp(sumfp32.g)); + if (outch0 + 2 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step * 2, afp(sumfp32.b)); + if (outch0 + 3 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step * 3, afp(sumfp32.a)); + } + } +} diff --git a/src/layer/vulkan/shader/convolutiondepthwise_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_int8.comp new file mode 100644 index 000000000000..5b541f6f1888 --- /dev/null +++ b/src/layer/vulkan/shader/convolutiondepthwise_int8.comp @@ -0,0 +1,138 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int kernel_w = 1; +layout(constant_id = 1) const int kernel_h = 1; +layout(constant_id = 2) const int dilation_w = 1; +layout(constant_id = 3) const int dilation_h = 1; +layout(constant_id = 4) const int stride_w = 1; +layout(constant_id = 5) const int stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) const int group = 1; +layout(constant_id = 8) const int activation_type = 0; +layout(constant_id = 9) const float activation_param_0 = 0; +layout(constant_id = 10) const float activation_param_1 = 0; +layout(constant_id = 11) const int use_int8_requantize = 0; + +#define shape_constant_id_offset 12 
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 3) readonly buffer bias_blob { float bias_data[]; }; +layout(binding = 4) readonly buffer descales_blob { float descales_data[]; }; +layout(binding = 5) readonly buffer top_scales_blob { sfp top_scales_data[]; }; +layout(binding = 6) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int maxk = kernel_w * kernel_h; + const int maxk4 = (maxk + 3) / 4 * 4; + const int maxk4d4 = maxk4 / 4; + + int sum = 0; + + int k = 0; + for (; k + 3 < maxk; k += 4) + { + const ivec4 k4 = k + ivec4(0, 1, 2, 3); + const ivec4 ky4 = k4 / kernel_w; + const ivec4 kx4 = k4 - ky4 * kernel_w; + const ivec4 x4 = gx * stride_w + 
kx4 * dilation_w; + const ivec4 y4 = gy * stride_h + ky4 * dilation_h; + + const int v0 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.r * psc(w) + x4.r); + const int v1 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.g * psc(w) + x4.g); + const int v2 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.b * psc(w) + x4.b); + const int v3 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.a * psc(w) + x4.a); + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + const int vp = int((uint(v0) & 0xffu) | ((uint(v1) & 0xffu) << 8) | ((uint(v2) & 0xffu) << 16) | ((uint(v3) & 0xffu) << 24)); + const int kvp = i8buffer_sm4(weight_data, gz * maxk4d4 + k / 4); + sum += dotPacked4x8EXT(vp, kvp); +#else + const ivec4 v = ivec4(v0, v1, v2, v3); + const ivec4 kv = i8buffer_ld4(weight_data, gz * maxk4d4 + k / 4); + sum += v.r * kv.r + v.g * kv.g + v.b * kv.b + v.a * kv.a; +#endif + } + + for (; k < maxk; k++) + { + const int ky = k / kernel_w; + const int kx = k - ky * kernel_w; + const int x = gx * stride_w + kx * dilation_w; + const int y = gy * stride_h + ky * dilation_h; + + const int v = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y * psc(w) + x); + const int kvp = i8buffer_sm4(weight_data, gz * maxk4d4 + k / 4); + const int ktail = k - k / 4 * 4; + const int kv = (kvp << (24 - ktail * 8)) >> 24; + + sum += v * kv; + } + + float sumfp32 = float(sum) * descales_data[gz]; + + if (bias_term == 1) + { + sumfp32 += bias_data[gz]; + } + + sumfp32 = float(activation_afp(afp(sumfp32), activation_type, activation_param_0, activation_param_1)); + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + if (use_int8_requantize == 1) + { + sumfp32 *= buffer_ld1(top_scales_data, gz); + const int v = float2int8(sumfp32); + i8buffer_st1(top_blob_int8_data, gi, v); + } + else + { + buffer_st1(top_blob_data, gi, afp(sumfp32)); + } +} diff --git 
a/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp new file mode 100644 index 000000000000..9c9ddfde1367 --- /dev/null +++ b/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp @@ -0,0 +1,161 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int kernel_w = 1; +layout(constant_id = 1) const int kernel_h = 1; +layout(constant_id = 2) const int dilation_w = 1; +layout(constant_id = 3) const int dilation_h = 1; +layout(constant_id = 4) const int stride_w = 1; +layout(constant_id = 5) const int stride_h = 1; +layout(constant_id = 6) const int bias_term = 0; +layout(constant_id = 7) const int group = 1; +layout(constant_id = 8) const int activation_type = 0; +layout(constant_id = 9) const float activation_param_0 = 0; +layout(constant_id = 10) const float activation_param_1 = 0; +layout(constant_id = 11) const int use_int8_requantize = 0; + +#define shape_constant_id_offset 12 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = 
shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; +layout(binding = 3) readonly buffer bias_blob { vec4 bias_data[]; }; +layout(binding = 4) readonly buffer descales_blob { vec4 descales_data[]; }; +layout(binding = 5) readonly buffer top_scales_blob { sfpvec4 top_scales_data[]; }; +layout(binding = 6) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int maxk = kernel_w * kernel_h; + const int maxk4 = (maxk + 3) / 4 * 4; + + ivec4 sum = ivec4(0); + + int k = 0; + for (; k + 3 < maxk; k += 4) + { + const ivec4 k4 = k + ivec4(0, 1, 2, 3); + const ivec4 ky4 = k4 / kernel_w; + const ivec4 kx4 = k4 - ky4 * kernel_w; + const ivec4 x4 = gx * stride_w + kx4 * dilation_w; + const ivec4 y4 = gy * stride_h + ky4 * dilation_h; + + const int v0p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.r * psc(w) + x4.r); + const int v1p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.g * psc(w) + x4.g); + const int v2p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.b * psc(w) + x4.b); + const int v3p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.a * psc(w) + x4.a); + + const ivec4 kp = weight_data[gz * maxk4 / 4 + k / 4]; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + const uint v0u = uint(v0p); + const 
uint v1u = uint(v1p); + const uint v2u = uint(v2p); + const uint v3u = uint(v3p); + + const int vp0 = int((v0u & 0x000000ffu) | ((v1u & 0x000000ffu) << 8) | ((v2u & 0x000000ffu) << 16) | ((v3u & 0x000000ffu) << 24)); + const int vp1 = int(((v0u & 0x0000ff00u) >> 8) | (v1u & 0x0000ff00u) | ((v2u & 0x0000ff00u) << 8) | ((v3u & 0x0000ff00u) << 16)); + const int vp2 = int(((v0u & 0x00ff0000u) >> 16) | ((v1u & 0x00ff0000u) >> 8) | (v2u & 0x00ff0000u) | ((v3u & 0x00ff0000u) << 8)); + const int vp3 = int(((v0u & 0xff000000u) >> 24) | ((v1u & 0xff000000u) >> 16) | ((v2u & 0xff000000u) >> 8) | (v3u & 0xff000000u)); + + sum.r += dotPacked4x8EXT(vp0, kp.r); + sum.g += dotPacked4x8EXT(vp1, kp.g); + sum.b += dotPacked4x8EXT(vp2, kp.b); + sum.a += dotPacked4x8EXT(vp3, kp.a); +#else + const ivec4 v0 = unpackInt4x8(v0p); + const ivec4 v1 = unpackInt4x8(v1p); + const ivec4 v2 = unpackInt4x8(v2p); + const ivec4 v3 = unpackInt4x8(v3p); + + const ivec4 k0 = unpackInt4x8(kp.r); + const ivec4 k1 = unpackInt4x8(kp.g); + const ivec4 k2 = unpackInt4x8(kp.b); + const ivec4 k3 = unpackInt4x8(kp.a); + + sum.r += v0.r * k0.r + v1.r * k0.g + v2.r * k0.b + v3.r * k0.a; + sum.g += v0.g * k1.r + v1.g * k1.g + v2.g * k1.b + v3.g * k1.a; + sum.b += v0.b * k2.r + v1.b * k2.g + v2.b * k2.b + v3.b * k2.a; + sum.a += v0.a * k3.r + v1.a * k3.g + v2.a * k3.b + v3.a * k3.a; +#endif + } + + for (; k < maxk; k++) + { + const int ky = k / kernel_w; + const int kx = k - ky * kernel_w; + const int x = gx * stride_w + kx * dilation_w; + const int y = gy * stride_h + ky * dilation_h; + + const ivec4 v = i8buffer_ld4(bottom_blob_int8_data, gz * psc(cstep) + y * psc(w) + x); + const ivec4 kp = weight_data[gz * maxk4 / 4 + k / 4]; + const int ktail = k - k / 4 * 4; + const ivec4 kv = (kp << ivec4(24 - ktail * 8)) >> ivec4(24); + + sum += v * kv; + } + + vec4 sumfp32 = vec4(sum) * descales_data[gz]; + + if (bias_term == 1) + { + sumfp32 += bias_data[gz]; + } + + sumfp32 = vec4(activation_afpvec4(afpvec4(sumfp32), 
activation_type, activation_param_0, activation_param_1));
+
+    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
+
+    if (use_int8_requantize == 1)
+    {
+        sumfp32 *= vec4(buffer_ld4(top_scales_data, gz));
+        const ivec4 v = float2int8vec4(sumfp32);
+        i8buffer_st4(top_blob_int8_data, gi, v);
+    }
+    else
+    {
+        buffer_st4(top_blob_data, gi, afpvec4(sumfp32));
+    }
+}
diff --git a/src/layer/vulkan/shader/flatten_int8.comp b/src/layer/vulkan/shader/flatten_int8.comp
new file mode 100644
index 000000000000..110b84fb1452
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_int8.comp
@@ -0,0 +1,55 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, read one int8 at a time
+layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // flat destination, written at index gx
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // one invocation copies one int8 element from the plane-strided source to flat output index gx
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= 1 || gz >= 1) // only gx addresses the output; y and z must be 0
+        return;
+
+    int size = psc(w) * psc(h); // number of elements in one w*h plane
+
+    int z = gx / size; // split the flat index into plane z, row y, column x
+    int y = gx % size / psc(w);
+    int x = gx % size % psc(w);
+
+    int v_offset = z * psc(cstep) + y * psc(w) + x; // source planes are cstep elements apart, rows are w apart
+
+    i8buffer_st1(top_blob_data, gx, i8buffer_ld1(bottom_blob_data, v_offset));
+}
diff --git a/src/layer/vulkan/shader/flatten_pack1to4_int8.comp b/src/layer/vulkan/shader/flatten_pack1to4_int8.comp
new file mode 100644
index 000000000000..86fabd3f5399
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_pack1to4_int8.comp
@@ -0,0 +1,74 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, read one int8 at a time
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // destination, one 4-component element per invocation
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // one invocation gathers 4 consecutive flattened elements into output element gx
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= 1 || gz >= 1) // only gx addresses the output; y and z must be 0
+        return;
+
+    ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); // flat indices of the 4 elements handled by this invocation
+
+    ivec4 v_offset;
+
+    if (psc(dims) == 2)
+    {
+        ivec4 y4 = i4 / psc(w);
+        ivec4 x4 = i4 % psc(w);
+
+        v_offset = y4 * psc(w) + x4; // rows are w apart, so this recombines to i4
+    }
+    else // if (psc(dims) == 3)
+    {
+        int size = psc(w) * psc(h); // number of elements in one w*h plane
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / psc(w);
+        ivec4 x4 = i4 % size % psc(w);
+
+        v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; // source planes are cstep elements apart
+    }
+
+    ivec4 v = ivec4(i8buffer_ld1(bottom_blob_data, v_offset.r), // four independent scalar loads, one per lane
+                    i8buffer_ld1(bottom_blob_data, v_offset.g),
+                    i8buffer_ld1(bottom_blob_data, v_offset.b),
+                    i8buffer_ld1(bottom_blob_data, v_offset.a));
+
+    i8buffer_st4(top_blob_data, gx, v);
+}
diff --git a/src/layer/vulkan/shader/flatten_pack4_int8.comp b/src/layer/vulkan/shader/flatten_pack4_int8.comp
new file mode 100644
index 000000000000..88b02ac01be1
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_pack4_int8.comp
@@ -0,0 +1,69 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, addressed with scalar int8 indices
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // destination, one 4-component element per invocation
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // one invocation gathers 4 consecutive flattened elements into output element gx
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= 1 || gz >= 1) // only gx addresses the output; y and z must be 0
+        return;
+
+    ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); // flat indices of the 4 elements handled by this invocation
+
+    ivec4 v_offset;
+
+    if (psc(dims) == 2)
+    {
+        ivec4 y4 = i4 / psc(w);
+        ivec4 x4 = i4 % psc(w);
+
+        v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; // row y is lane y%4 of row-group y/4; NOTE(review): implies 4 rows interleaved per source element - confirm against caller
+    }
+    else // if (psc(dims) == 3)
+    {
+        int size = psc(w) * psc(h); // number of elements in one w*h plane
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / psc(w);
+        ivec4 x4 = i4 % size % psc(w);
+
+        v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; // plane z is lane z%4 of plane-group z/4; NOTE(review): implies 4 planes interleaved per source element - confirm against caller
+    }
+
+    i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); // helper defined outside this file; presumably copies the 4 scalars at v_offset into element gx - confirm
+}
diff --git a/src/layer/vulkan/shader/gemm_int8.comp b/src/layer/vulkan/shader/gemm_int8.comp
new file mode 100644
index 000000000000..34d552991367
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8.comp
@@ -0,0 +1,462 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+layout(constant_id = 0) const float alpha = 1.f;
+layout(constant_id = 1) const float beta = 1.f;
+layout(constant_id = 2) const int constantC = 0; // when 1, main() uses constant_broadcast_type_C instead of the push constant
+layout(constant_id = 3) const int constant_broadcast_type_C = 0;
+layout(constant_id = 4) const int output_transpose = 0;
+
+layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout(binding = 1) readonly buffer A_int8_blob { sint8 A_int8_data[]; }; // indexed in main() as row * K + k
+layout(binding = 2) readonly buffer B_int8_blob { sint8 B_int8_data[]; }; // indexed in main() as output_column * K + k
+layout(binding = 3) readonly buffer C_blob { sfp C_blob_data[]; };
+layout(binding = 4) readonly buffer A_descales_blob { float A_descales_data[]; }; // indexed by output row in main()
+layout(binding = 5) readonly buffer B_descale_blob { float B_descale_data[]; }; // only element 0 is read in main()
+
+layout(push_constant) uniform parameter
+{
+    int M;
+    int N;
+    int K;
+    int broadcast_type_C;
+
int outhstep; // stride between consecutive stored rows of top_blob (see the stores at the end of main)
+} p;
+
+#if NCNN_shader_local_memory
+// avoid bank conflict
+#define PAD 1
+shared int tmp_a[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; // each int holds 4 int8 values packed with packInt4x8
+shared int tmp_b[8][LOCAL_MEMORY_UNROLL_INCH + PAD];
+#endif
+
+void main() // each invocation produces a 4x4 output tile: rows gy*4..gy*4+3, columns gx*4..gx*4+3
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    const uint gz = gl_GlobalInvocationID.z;
+
+#if !NCNN_shader_local_memory
+    if (gx * 4 >= p.N || gy * 4 >= p.M || gz >= 1)
+        return;
+#else
+    if (gz >= 1) // the N/M bounds check is deferred until after the loop that contains barrier()
+        return;
+#endif
+
+    ivec4 sum0 = ivec4(0); // sumR holds the integer accumulators of tile row R; components r/g/b/a are tile columns 0..3
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+#if NCNN_shader_local_memory
+    const uint lx = gl_LocalInvocationID.x;
+    const uint ly = gl_LocalInvocationID.y;
+
+    for (int k = 0; k < p.K; k += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        {
+            ivec4 a = ivec4(0); // stays zero for rows >= M and for k positions >= K
+            const int ak = k + int(lx / 4) * 4; // lx / 4 picks which group of 4 k-values this invocation loads
+            const uint ay = gy * 4 + lx % 4; // lx % 4 picks the row inside the 4-row tile
+            if (ay < p.M)
+            {
+                const int ai = int(ay) * p.K + ak;
+                if (ak + 0 < p.K) a.r = i8buffer_ld1(A_int8_data, ai + 0);
+                if (ak + 1 < p.K) a.g = i8buffer_ld1(A_int8_data, ai + 1);
+                if (ak + 2 < p.K) a.b = i8buffer_ld1(A_int8_data, ai + 2);
+                if (ak + 3 < p.K) a.a = i8buffer_ld1(A_int8_data, ai + 3);
+            }
+
+            tmp_a[ly][lx] = packInt4x8(a);
+
+            ivec4 b = ivec4(0); // stays zero for columns >= N and for k positions >= K
+            const int bk = k + int(ly / 4) * 4; // ly / 4 picks which group of 4 k-values this invocation loads
+            const uint bx = gx * 4 + ly % 4; // ly % 4 picks the column inside the 4-column tile
+            if (bx < p.N)
+            {
+                const int bi = int(bx) * p.K + bk;
+                if (bk + 0 < p.K) b.r = i8buffer_ld1(B_int8_data, bi + 0);
+                if (bk + 1 < p.K) b.g = i8buffer_ld1(B_int8_data, bi + 1);
+                if (bk + 2 < p.K) b.b = i8buffer_ld1(B_int8_data, bi + 2);
+                if (bk + 3 < p.K) b.a = i8buffer_ld1(B_int8_data, bi + 3);
+            }
+
+            tmp_b[lx][ly] = packInt4x8(b);
+        }
+
+        barrier(); // tmp_a / tmp_b are read below by invocations other than the ones that wrote them
+
+        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH / 4; k4++)
+        {
+            const int kk = k4 * 4; // first slot of this k-group; slots kk+0..kk+3 are the 4 tile rows (A) or columns (B)
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+            const int a0 = tmp_a[ly][kk + 0];
+            const int a1 = tmp_a[ly][kk + 1];
+            const int a2 = tmp_a[ly][kk + 2];
+            const int a3 = tmp_a[ly][kk + 3];
+
+            const int b0 = tmp_b[lx][kk + 0];
+            const int b1 = tmp_b[lx][kk + 1];
+            const int b2 = tmp_b[lx][kk + 2];
+            const int b3 = tmp_b[lx][kk + 3];
+
+            sum0.r += dotPacked4x8EXT(a0, b0);
+            sum0.g += dotPacked4x8EXT(a0, b1);
+            sum0.b += dotPacked4x8EXT(a0, b2);
+            sum0.a += dotPacked4x8EXT(a0, b3);
+            sum1.r += dotPacked4x8EXT(a1, b0);
+            sum1.g += dotPacked4x8EXT(a1, b1);
+            sum1.b += dotPacked4x8EXT(a1, b2);
+            sum1.a += dotPacked4x8EXT(a1, b3);
+            sum2.r += dotPacked4x8EXT(a2, b0);
+            sum2.g += dotPacked4x8EXT(a2, b1);
+            sum2.b += dotPacked4x8EXT(a2, b2);
+            sum2.a += dotPacked4x8EXT(a2, b3);
+            sum3.r += dotPacked4x8EXT(a3, b0);
+            sum3.g += dotPacked4x8EXT(a3, b1);
+            sum3.b += dotPacked4x8EXT(a3, b2);
+            sum3.a += dotPacked4x8EXT(a3, b3);
+#else
+            const ivec4 a0 = unpackInt4x8(tmp_a[ly][kk + 0]);
+            const ivec4 a1 = unpackInt4x8(tmp_a[ly][kk + 1]);
+            const ivec4 a2 = unpackInt4x8(tmp_a[ly][kk + 2]);
+            const ivec4 a3 = unpackInt4x8(tmp_a[ly][kk + 3]);
+
+            const ivec4 b0 = unpackInt4x8(tmp_b[lx][kk + 0]);
+            const ivec4 b1 = unpackInt4x8(tmp_b[lx][kk + 1]);
+            const ivec4 b2 = unpackInt4x8(tmp_b[lx][kk + 2]);
+            const ivec4 b3 = unpackInt4x8(tmp_b[lx][kk + 3]);
+
+            sum0.r += a0.r * b0.r + a0.g * b0.g + a0.b * b0.b + a0.a * b0.a;
+            sum0.g += a0.r * b1.r + a0.g * b1.g + a0.b * b1.b + a0.a * b1.a;
+            sum0.b += a0.r * b2.r + a0.g * b2.g + a0.b * b2.b + a0.a * b2.a;
+            sum0.a += a0.r * b3.r + a0.g * b3.g + a0.b * b3.b + a0.a * b3.a;
+            sum1.r += a1.r * b0.r + a1.g * b0.g + a1.b * b0.b + a1.a * b0.a;
+            sum1.g += a1.r * b1.r + a1.g * b1.g + a1.b * b1.b + a1.a * b1.a;
+            sum1.b += a1.r * b2.r + a1.g * b2.g + a1.b * b2.b + a1.a * b2.a;
+            sum1.a += a1.r * b3.r + a1.g * b3.g + a1.b * b3.b + a1.a * b3.a;
+            sum2.r += a2.r * b0.r + a2.g * b0.g + a2.b * b0.b + a2.a * b0.a;
+            sum2.g += a2.r * b1.r + a2.g * b1.g + a2.b * b1.b + a2.a * b1.a;
+            sum2.b += a2.r * b2.r + a2.g * b2.g + a2.b * b2.b + a2.a * b2.a;
+            sum2.a += a2.r * b3.r + a2.g * b3.g + a2.b * b3.b + a2.a * b3.a;
+            sum3.r += a3.r * b0.r + a3.g * b0.g + a3.b * b0.b + a3.a * b0.a;
+            sum3.g += a3.r * b1.r + a3.g * b1.g + a3.b * b1.b + a3.a * b1.a;
+            sum3.b += a3.r * b2.r + a3.g * b2.g + a3.b * b2.b + a3.a * b2.a;
+            sum3.a += a3.r * b3.r + a3.g * b3.g + a3.b * b3.b + a3.a * b3.a;
+#endif
+        }
+
+        barrier(); // finish reading this slice before the next iteration overwrites tmp_a / tmp_b
+    }
+
+    if (gx * 4 >= p.N || gy * 4 >= p.M) // tiles entirely outside the output contributed loads above but store nothing
+        return;
+#else
+    for (int k = 0; k < p.K; k += 4)
+    {
+        const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3);
+        const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3);
+
+        ivec4 a0 = ivec4(0); // aR: 4 consecutive k-values of tile row R, zero where row or k is out of range
+        ivec4 a1 = ivec4(0);
+        ivec4 a2 = ivec4(0);
+        ivec4 a3 = ivec4(0);
+
+        if (gy4.r < p.M)
+        {
+            const int ai = int(gy4.r) * p.K + k;
+            a0.r = i8buffer_ld1(A_int8_data, ai + 0); // k < K is guaranteed by the loop condition
+            if (k + 1 < p.K) a0.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a0.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a0.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.g < p.M)
+        {
+            const int ai = int(gy4.g) * p.K + k;
+            a1.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a1.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a1.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a1.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.b < p.M)
+        {
+            const int ai = int(gy4.b) * p.K + k;
+            a2.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a2.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a2.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a2.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.a < p.M)
+        {
+            const int ai = int(gy4.a) * p.K + k;
+            a3.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a3.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a3.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a3.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+
+        ivec4 b0 = ivec4(0); // bC: 4 consecutive k-values of tile column C, zero where column or k is out of range
+        ivec4 b1 = ivec4(0);
+        ivec4 b2 = ivec4(0);
+        ivec4 b3 = ivec4(0);
+
+        if (gx4.r < p.N)
+        {
+            const int bi = int(gx4.r) * p.K + k;
+            b0.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b0.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b0.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b0.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.g < p.N)
+        {
+            const int bi = int(gx4.g) * p.K + k;
+            b1.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b1.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b1.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b1.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.b < p.N)
+        {
+            const int bi = int(gx4.b) * p.K + k;
+            b2.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b2.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b2.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b2.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.a < p.N)
+        {
+            const int bi = int(gx4.a) * p.K + k;
+            b3.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b3.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b3.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b3.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        const int a0_packed = packInt4x8(a0);
+        const int a1_packed = packInt4x8(a1);
+        const int a2_packed = packInt4x8(a2);
+        const int a3_packed = packInt4x8(a3);
+
+        const int b0_packed = packInt4x8(b0);
+        const int b1_packed = packInt4x8(b1);
+        const int b2_packed = packInt4x8(b2);
+        const int b3_packed = packInt4x8(b3);
+
+        sum0.r += dotPacked4x8EXT(a0_packed, b0_packed);
+        sum0.g += dotPacked4x8EXT(a0_packed, b1_packed);
+        sum0.b += dotPacked4x8EXT(a0_packed, b2_packed);
+        sum0.a += dotPacked4x8EXT(a0_packed, b3_packed);
+        sum1.r += dotPacked4x8EXT(a1_packed, b0_packed);
+        sum1.g += dotPacked4x8EXT(a1_packed, b1_packed);
+        sum1.b += dotPacked4x8EXT(a1_packed, b2_packed);
+        sum1.a += dotPacked4x8EXT(a1_packed, b3_packed);
+        sum2.r += dotPacked4x8EXT(a2_packed, b0_packed);
+        sum2.g += dotPacked4x8EXT(a2_packed, b1_packed);
+        sum2.b += dotPacked4x8EXT(a2_packed, b2_packed);
+        sum2.a += dotPacked4x8EXT(a2_packed, b3_packed);
+        sum3.r += dotPacked4x8EXT(a3_packed, b0_packed);
+        sum3.g += dotPacked4x8EXT(a3_packed, b1_packed);
+        sum3.b += dotPacked4x8EXT(a3_packed, b2_packed);
+        sum3.a += dotPacked4x8EXT(a3_packed, b3_packed);
+#else
+        sum0.r += a0.r * b0.r + a0.g * b0.g + a0.b * b0.b + a0.a * b0.a;
+        sum0.g += a0.r * b1.r + a0.g * b1.g + a0.b * b1.b + a0.a * b1.a;
+        sum0.b += a0.r * b2.r + a0.g * b2.g + a0.b * b2.b + a0.a * b2.a;
+        sum0.a += a0.r * b3.r + a0.g * b3.g + a0.b * b3.b + a0.a * b3.a;
+        sum1.r += a1.r * b0.r + a1.g * b0.g + a1.b * b0.b + a1.a * b0.a;
+        sum1.g += a1.r * b1.r + a1.g * b1.g + a1.b * b1.b + a1.a * b1.a;
+        sum1.b += a1.r * b2.r + a1.g * b2.g + a1.b * b2.b + a1.a * b2.a;
+        sum1.a += a1.r * b3.r + a1.g * b3.g + a1.b * b3.b + a1.a * b3.a;
+        sum2.r += a2.r * b0.r + a2.g * b0.g + a2.b * b0.b + a2.a * b0.a;
+        sum2.g += a2.r * b1.r + a2.g * b1.g + a2.b * b1.b + a2.a * b1.a;
+        sum2.b += a2.r * b2.r + a2.g * b2.g + a2.b * b2.b + a2.a * b2.a;
+        sum2.a += a2.r * b3.r + a2.g * b3.g + a2.b * b3.b + a2.a * b3.a;
+        sum3.r += a3.r * b0.r + a3.g * b0.g + a3.b * b0.b + a3.a * b0.a;
+        sum3.g += a3.r * b1.r + a3.g * b1.g + a3.b * b1.b + a3.a * b1.a;
+        sum3.b += a3.r * b2.r + a3.g * b2.g + a3.b * b2.b + a3.a * b2.a;
+        sum3.a += a3.r * b3.r + a3.g * b3.g + a3.b * b3.b + a3.a * b3.a;
+#endif
+    }
+#endif
+
+    const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3); // output columns of this tile
+    const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3); // output rows of this tile
+
+    const float B_descale = B_descale_data[0];
+
+    float descale0 = 0.f; // per-row factor: A_descales[row] * B_descale, left at 0 for rows >= M
+    float descale1 = 0.f;
+    float descale2 = 0.f;
+    float descale3 = 0.f;
+
+    if (gy4.r < p.M)
+    {
+        descale0 = A_descales_data[gy4.r] * B_descale;
+    }
+    if (gy4.g < p.M)
+    {
+        descale1 = A_descales_data[gy4.g] * B_descale;
+    }
+    if (gy4.b < p.M)
+    {
+        descale2 = A_descales_data[gy4.b] * B_descale;
+    }
+    if (gy4.a < p.M)
+    {
+        descale3 = A_descales_data[gy4.a] * B_descale;
+    }
+
+    vec4 sumfp0 = vec4(sum0) * descale0; // convert the integer accumulators to float and apply the row factor
+    vec4 sumfp1 = vec4(sum1) * descale1;
+    vec4 sumfp2 = vec4(sum2) * descale2;
+    vec4 sumfp3 = vec4(sum3) * descale3;
+
+    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;
+    if (broadcast_type_C != -1) // -1 skips the C term entirely
+    {
+        if (broadcast_type_C == 0) // a single value C[0] is added to every output element
+        {
+            const float c = float(buffer_ld1(C_blob_data, 0)) * beta;
+            sumfp0 += c;
+            sumfp1 += c;
+            sumfp2 += c;
+            sumfp3 += c;
+        }
+        if (broadcast_type_C == 1 || broadcast_type_C == 2) // C is indexed by output row
+        {
+            if (gy4.r < p.M) sumfp0 += float(buffer_ld1(C_blob_data, gy4.r)) * beta;
+            if (gy4.g < p.M) sumfp1 += float(buffer_ld1(C_blob_data, gy4.g)) * beta;
+            if (gy4.b < p.M) sumfp2 += float(buffer_ld1(C_blob_data, gy4.b)) * beta;
+            if (gy4.a < p.M) sumfp3 += float(buffer_ld1(C_blob_data, gy4.a)) * beta;
+        }
+        if (broadcast_type_C == 3) // C is indexed per element as row * N + column
+        {
+            if (gy4.r < p.M)
+            {
+                const uint ci = gy4.r * uint(p.N) + gx * 4;
+                sumfp0.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp0.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp0.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp0.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.g < p.M)
+            {
+                const uint ci = gy4.g * uint(p.N) + gx * 4;
+                sumfp1.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp1.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp1.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp1.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.b < p.M)
+            {
+                const uint ci = gy4.b * uint(p.N) + gx * 4;
+                sumfp2.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp2.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp2.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp2.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.a < p.M)
+            {
+                const uint ci = gy4.a * uint(p.N) + gx * 4;
+                sumfp3.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp3.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp3.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp3.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+        }
+        if (broadcast_type_C == 4) // C is indexed by output column and shared by all rows
+        {
+            vec4 c = vec4(0.f);
+            c.r = float(buffer_ld1(C_blob_data, gx4.r));
+            if (gx4.g < p.N) c.g = float(buffer_ld1(C_blob_data, gx4.g));
+            if (gx4.b < p.N) c.b = float(buffer_ld1(C_blob_data, gx4.b));
+            if (gx4.a < p.N) c.a = float(buffer_ld1(C_blob_data, gx4.a));
+
+            sumfp0 += c * beta;
+            sumfp1 += c * beta;
+            sumfp2 += c * beta;
+            sumfp3 += c * beta;
+        }
+    }
+
+    if (alpha != 1.f) // applied after the C term, so alpha also scales beta * C
+    {
+        sumfp0 *= alpha;
+        sumfp1 *= alpha;
+        sumfp2 *= alpha;
+        sumfp3 *= alpha;
+    }
+
+    if (output_transpose == 1)
+    {
+        const uvec4 gi4 = gx4 * uint(p.outhstep) + gy * 4; // element (row, col) is stored at col * outhstep + row
+
+        buffer_st1(top_blob_data, gi4.r, sumfp0.r); // row 0 / column 0 of the tile is in range after the early return
+        if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.r + 1, sumfp1.r);
+        if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.r + 2, sumfp2.r);
+        if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.r + 3, sumfp3.r);
+        if (gx4.g < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.g, sumfp0.g);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.g + 1, sumfp1.g);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.g + 2, sumfp2.g);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.g + 3, sumfp3.g);
+        }
+        if (gx4.b < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.b, sumfp0.b);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.b + 1, sumfp1.b);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.b + 2, sumfp2.b);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.b + 3, sumfp3.b);
+        }
+        if (gx4.a < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.a, sumfp0.a);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.a + 1, sumfp1.a);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.a + 2, sumfp2.a);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.a + 3, sumfp3.a);
+        }
+    }
+    else
+    {
+        const uvec4 gi4 = gy4 * uint(p.outhstep) + gx * 4; // element (row, col) is stored at row * outhstep + col
+
+        buffer_st1(top_blob_data, gi4.r, sumfp0.r); // row 0 / column 0 of the tile is in range after the early return
+        if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.r + 1, sumfp0.g);
+        if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.r + 2, sumfp0.b);
+        if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.r + 3, sumfp0.a);
+        if (gy4.g < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.g, sumfp1.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.g + 1, sumfp1.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.g + 2, sumfp1.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.g + 3, sumfp1.a);
+        }
+        if (gy4.b < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.b, sumfp2.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.b + 1, sumfp2.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.b + 2, sumfp2.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.b + 3, sumfp2.a);
+        }
+        if (gy4.a < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.a, sumfp3.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.a + 1, sumfp3.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.a + 2, sumfp3.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.a + 3, sumfp3.a);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/gemm_int8_cm.comp b/src/layer/vulkan/shader/gemm_int8_cm.comp
new file mode 100644
index 000000000000..26b0fd4a9275
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8_cm.comp
@@ -0,0 +1,1087 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_basic : require
+
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#if ncnn_VK_KHR_cooperative_matrix
+#extension GL_KHR_cooperative_matrix : require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix : require
+#extension GL_NV_integer_cooperative_matrix : require
+#endif
+
+layout(constant_id = 0) const float alpha = 1.f;
+layout(constant_id = 1) const float beta = 1.f; +layout(constant_id = 2) const int constantA = 0; +layout(constant_id = 3) const int constantB = 0; +layout(constant_id = 4) const int constantC = 0; +layout(constant_id = 5) const int constant_broadcast_type_C = 0; +layout(constant_id = 6) const int output_transpose = 0; +layout(constant_id = 7) const uint GM = 0; +layout(constant_id = 8) const uint GN = 0; +layout(constant_id = 9) const uint GK = 0; +layout(constant_id = 10) const uint out_elempack = 0; + +layout(constant_id = 11 + 0) const uint M = 1; +layout(constant_id = 11 + 1) const uint N = 1; +layout(constant_id = 11 + 2) const uint K = 1; +layout(constant_id = 11 + 3) const uint subgroup_size = 32; +layout(constant_id = 11 + 4) const uint UNROLL_SG_M = 2; +layout(constant_id = 11 + 5) const uint UNROLL_SG_N = 2; +layout(constant_id = 11 + 6) const uint UNROLL_SG_K = 2; +layout(constant_id = 11 + 7) const uint UNROLL_WG_M = 2; +layout(constant_id = 11 + 8) const uint UNROLL_WG_N = 2; + +layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 1) readonly buffer A_int8_blob { sint8vec4 A_int8_data[]; }; +layout(binding = 2) readonly buffer B_int8_blob { sint8vec4 B_int8_data[]; }; +layout(binding = 3) readonly buffer C_blob { sfp C_blob_data[]; }; +layout(binding = 4) readonly buffer A_descales_blob { float A_descales_data[]; }; +layout(binding = 5) readonly buffer B_descale_blob { float B_descale_data[]; }; +layout(binding = 6) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; + +layout(push_constant) uniform parameter +{ + uint GM; + uint GN; + uint GK; + int broadcast_type_C; + uint outhstep; + uint out_elempack; +} p; + +// cannot alias output with a and b +// cm store may happen while another subgroup is loading +const uint Md4 = M / 4; +const uint Nd4 = N / 4; +const uint Kd4 = K / 4; + +// avoid bank conflict +#if ncnn_VK_KHR_cooperative_matrix +#define PAD 1 +#elif ncnn_VK_NV_cooperative_matrix +// fixme: pad 
causes incorrect result on old driver +#define PAD 0 +#endif + +const uint Nd4p = Nd4 + PAD; +const uint Kd4p = Kd4 + PAD; + +shared int tmp_a[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; +shared int tmp_b[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; +shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * Nd4]; + +void main() +{ + // neither gl_SubgroupSize nor gl_WorkGroupSize.x is a constant + const uint local_size = subgroup_size * UNROLL_WG_M * UNROLL_WG_N; + + // [ WG_UN * WG_UM * [ SG_UN * SG_UM * subgroup ] ] + + // <----WG_UN----> + // +---N--+-SG_UN+------+------+ + // | | | |XXXXXX| + // M | XXXX<----coopmat + // | | | |XXXXXX| + // +-- --SG0-- --+-- --SG2-- --+ + // | | | | | + // SG_UM | | + // | | | | | + // ^ +------+--WORKGROUP--+------+ + // | | | | | | + // | | | | + // | | | | | | + // WG_UM+-- --SG1-- --+-- --SG3-- --+ + // | | | | | | + // | | | | + // | | | | | | + // v +------+------+------+------+ + // + + const uint wgi = gl_WorkGroupID.x; + const uint sgi = gl_SubgroupID; + + const uint wgmm = (psc(GM) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M); + const uint wgnn = (psc(GN) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N); + + const uint wgmi = wgi / wgnn; + const uint wgni = wgi % wgnn; + + const uint sgmi = sgi / UNROLL_WG_N; + const uint sgni = sgi % UNROLL_WG_N; + + const uint kk = (psc(GK) + K - 1) / K; + + if (wgmi >= wgmm) + return; + + const uint si = gl_SubgroupInvocationID; + + const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N; + const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M; + +#if ncnn_VK_KHR_cooperative_matrix + coopmat sum[UNROLL_SG_N][UNROLL_SG_M]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M]; +#endif + + { + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix 
+ sum[zn][zm] = coopmat(0); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0); +#endif + } + } + } + + uint k = 0; + + if (kk >= UNROLL_SG_K * 2) + { + // local stack and shared memory ping-pong + + // prefetch + int prefetch_tmp_a[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4 + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)]; + int prefetch_tmp_b[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4 + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)]; + + // prefetch the very first + { + const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M; + const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K; + const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK) + { + int v = 0; + + if (constantA == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (0 < kk_full) + { + const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + sgmi * M_Kd4_USGM_USGK; + v = i8buffer_sm4(A_int8_data, a_offset + siq); + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint sij = siq % M_Kd4_USGM; + + if (zk < kk) + { + const uint tail_offset = (wgmi * kk * UNROLL_WG_M + sgmi) * M_Kd4_USGM; + v = i8buffer_sm4(A_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + const uint gm = (mi + zm) * M + i; + const uint gk = zk * K + j * 4; + + if (gm < psc(GM) && gk < psc(GK)) + { + if (psc(GK) % 4 == 0) + { + v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4); + } + else + { + const uint ai = gm * psc(GK) + gk; + const uint aim4 = ai 
% 4; + const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4); + ivec4 v1 = ivec4(0); + if (gk + 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1); + ivec4 v4; + if (aim4 == 0) + { + v4 = v0; + } + else if (aim4 == 1) + { + v4 = ivec4(v0.g, v0.b, v0.a, v1.r); + } + else if (aim4 == 2) + { + v4 = ivec4(v0.b, v0.a, v1.r, v1.g); + } + else + { + v4 = ivec4(v0.a, v1.r, v1.g, v1.b); + } + if (gk + 1 >= psc(GK)) v4.g = 0; + if (gk + 2 >= psc(GK)) v4.b = 0; + if (gk + 3 >= psc(GK)) v4.a = 0; + v = packInt4x8(v4); + } + } + } + + prefetch_tmp_a[q] = v; + } + } + } + { + const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N; + const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K; + const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK) + { + int v = 0; + + if (constantB == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (0 < kk_full) + { + const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + sgni * K_Nd4_USGN_USGK; + v = i8buffer_sm4(B_int8_data, b_offset + siq); + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint sij = siq % K_Nd4_USGN; + + if (zk < kk) + { + const uint tail_offset = (wgni * kk * UNROLL_WG_N + sgni) * K_Nd4_USGN; + v = i8buffer_sm4(B_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gk = zk * K + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gn < psc(GN) && gk < psc(GK)) + { + ivec4 v4 = ivec4(0); + uint bi = gn * psc(GK) + gk; + v4.r = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + if (gn + 1 < psc(GN)) + { 
+ bi += psc(GK); + v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 2 < psc(GN)) + { + bi += psc(GK); + v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 3 < psc(GN)) + { + bi += psc(GK); + v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + v = packInt4x8(v4); + } + } + + prefetch_tmp_b[q] = v; + } + } + } + + for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K) + { + // copy prefetched tile to shared memory + { + const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M; + const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K; + const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = prefetch_tmp_a[q]; +#elif ncnn_VK_NV_cooperative_matrix + tmp_a[sgmi][siq] = prefetch_tmp_a[q]; +#endif + } + } + } + { + const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N; + const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K; + const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const 
uint i = ij / Nd4; + const uint j = ij % Nd4; + + tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = prefetch_tmp_b[q]; +#elif ncnn_VK_NV_cooperative_matrix + tmp_b[sgni][siq] = prefetch_tmp_b[q]; +#endif + } + } + } + + barrier(); + + // prefetch next tile + const uint ki = k + UNROLL_SG_K; + { + const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M; + const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K; + const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK) + { + int v = 0; + + if (constantA == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (ki / UNROLL_SG_K < kk_full) + { + const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + ((ki / UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM_USGK; + v = i8buffer_sm4(A_int8_data, a_offset + siq); + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint sij = siq % M_Kd4_USGM; + + if (ki + zk < kk) + { + const uint tail_offset = (wgmi * kk * UNROLL_WG_M + kk_full * UNROLL_WG_M * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM; + v = i8buffer_sm4(A_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + const uint gm = (mi + zm) * M + i; + const uint gk = (ki + zk) * K + j * 4; + + if (gm < psc(GM) && gk < psc(GK)) + { + if (psc(GK) % 4 == 0) + { + v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4); + } + else + { + const uint ai = gm * psc(GK) + gk; + const uint aim4 = ai % 4; + const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4); + ivec4 v1 = ivec4(0); + if (gk 
+ 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1); + ivec4 v4; + if (aim4 == 0) + { + v4 = v0; + } + else if (aim4 == 1) + { + v4 = ivec4(v0.g, v0.b, v0.a, v1.r); + } + else if (aim4 == 2) + { + v4 = ivec4(v0.b, v0.a, v1.r, v1.g); + } + else + { + v4 = ivec4(v0.a, v1.r, v1.g, v1.b); + } + if (gk + 1 >= psc(GK)) v4.g = 0; + if (gk + 2 >= psc(GK)) v4.b = 0; + if (gk + 3 >= psc(GK)) v4.a = 0; + v = packInt4x8(v4); + } + } + } + + prefetch_tmp_a[q] = v; + } + } + } + { + const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N; + const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K; + const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK) + { + int v = 0; + + if (constantB == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (ki / UNROLL_SG_K < kk_full) + { + const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN_USGK; + v = i8buffer_sm4(B_int8_data, b_offset + siq); + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint sij = siq % K_Nd4_USGN; + + if (ki + zk < kk) + { + const uint tail_offset = (wgni * kk * UNROLL_WG_N + kk_full * UNROLL_WG_N * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN; + v = i8buffer_sm4(B_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gk = (ki + zk) * K + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gn < psc(GN) && gk < psc(GK)) + { + ivec4 v4 = ivec4(0); + uint bi = gn * psc(GK) + gk; + v4.r = 
i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + if (gn + 1 < psc(GN)) + { + bi += psc(GK); + v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 2 < psc(GN)) + { + bi += psc(GK); + v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 3 < psc(GN)) + { + bi += psc(GK); + v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + v = packInt4x8(v4); + } + } + + prefetch_tmp_b[q] = v; + } + } + } + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + + // copy and compute the last prefetched tile + { + const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M; + const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K; + const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + 
(subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = prefetch_tmp_a[q]; +#elif ncnn_VK_NV_cooperative_matrix + tmp_a[sgmi][siq] = prefetch_tmp_a[q]; +#endif + } + } + } + { + const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N; + const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K; + const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK) + { +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = prefetch_tmp_b[q]; +#elif ncnn_VK_NV_cooperative_matrix + tmp_b[sgni][siq] = prefetch_tmp_b[q]; +#endif + } + } + } + + barrier(); + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if 
ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + else + { + for (uint ki = 0; ki < kk; ki += UNROLL_SG_K) + { + { + const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M; + const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K; + const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); + [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; + + if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK) + { + int v = 0; + + if (constantA == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (ki / UNROLL_SG_K < kk_full) + { + const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + ((ki / UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM_USGK; + v = i8buffer_sm4(A_int8_data, a_offset + siq); + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint sij = siq % M_Kd4_USGM; + + if (ki + zk < kk) + { + const uint tail_offset = (wgmi * kk * UNROLL_WG_M + kk_full * 
UNROLL_WG_M * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM; + v = i8buffer_sm4(A_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + const uint gm = (mi + zm) * M + i; + const uint gk = (ki + zk) * K + j * 4; + + if (gm < psc(GM) && gk < psc(GK)) + { + if (psc(GK) % 4 == 0) + { + v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4); + } + else + { + const uint ai = gm * psc(GK) + gk; + const uint aim4 = ai % 4; + const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4); + ivec4 v1 = ivec4(0); + if (gk + 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1); + ivec4 v4; + if (aim4 == 0) + { + v4 = v0; + } + else if (aim4 == 1) + { + v4 = ivec4(v0.g, v0.b, v0.a, v1.r); + } + else if (aim4 == 2) + { + v4 = ivec4(v0.b, v0.a, v1.r, v1.g); + } + else + { + v4 = ivec4(v0.a, v1.r, v1.g, v1.b); + } + if (gk + 1 >= psc(GK)) v4.g = 0; + if (gk + 2 >= psc(GK)) v4.b = 0; + if (gk + 3 >= psc(GK)) v4.a = 0; + v = packInt4x8(v4); + } + } + } + +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / M_Kd4_USGM; + const uint zmij = siq % M_Kd4_USGM; + const uint zm = zmij / (M * Kd4); + const uint ij = zmij % (M * Kd4); + const uint i = ij / Kd4; + const uint j = ij % Kd4; + + tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = v; +#elif ncnn_VK_NV_cooperative_matrix + tmp_a[sgmi][siq] = v; +#endif + } + } + } + { + const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N; + const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K; + const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); + [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++) + { + const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si; + + if (K_Nd4_USGN_USGK % (subgroup_size 
* UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK) + { + int v = 0; + + if (constantB == 1) + { + const uint kk_full = kk / UNROLL_SG_K; + + if (ki / UNROLL_SG_K < kk_full) + { + const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN_USGK; + v = i8buffer_sm4(B_int8_data, b_offset + siq); + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint sij = siq % K_Nd4_USGN; + + if (ki + zk < kk) + { + const uint tail_offset = (wgni * kk * UNROLL_WG_N + kk_full * UNROLL_WG_N * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN; + v = i8buffer_sm4(B_int8_data, tail_offset + sij); + } + } + } + else + { + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gk = (ki + zk) * K + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gn < psc(GN) && gk < psc(GK)) + { + ivec4 v4 = ivec4(0); + uint bi = gn * psc(GK) + gk; + v4.r = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + if (gn + 1 < psc(GN)) + { + bi += psc(GK); + v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 2 < psc(GN)) + { + bi += psc(GK); + v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + if (gn + 3 < psc(GN)) + { + bi += psc(GK); + v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4]; + } + v = packInt4x8(v4); + } + } + +#if ncnn_VK_KHR_cooperative_matrix + const uint zk = siq / K_Nd4_USGN; + const uint znij = siq % K_Nd4_USGN; + const uint zn = znij / (K * Nd4); + const uint ij = znij % (K * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = v; +#elif ncnn_VK_NV_cooperative_matrix + tmp_b[sgni][siq] = v; +#endif + } + } + } + + barrier(); + +#if ncnn_VK_KHR_cooperative_matrix + coopmat A[UNROLL_SG_M]; + coopmat B[UNROLL_SG_N]; +#elif ncnn_VK_NV_cooperative_matrix + 
icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M]; + icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N]; +#endif + + [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false); +#endif + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { +#if ncnn_VK_KHR_cooperative_matrix + sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]); +#elif ncnn_VK_NV_cooperative_matrix + sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]); +#endif + } + } + } + + barrier(); + } + } + + [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++) + { + [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++) + { + if (output_transpose == 0 && psc(out_elempack) == 4) + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true); +#endif + } + else + { +#if ncnn_VK_KHR_cooperative_matrix + coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, 
false); +#endif + } + } + } + + barrier(); + + const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C; + + if (output_transpose == 0 && psc(out_elempack) == 4) + { + const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N; + const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN) + { + const uint zn = siq / (Md4 * N * UNROLL_SG_M); + const uint zmij = siq % (Md4 * N * UNROLL_SG_M); + const uint zm = zmij / (Md4 * N); + const uint ij = zmij % (Md4 * N); + const uint i = ij / Md4; + const uint j = ij % Md4; + + const uint gn = (ni + zn) * N + i; + const uint gm = (mi + zm) * Md4 + j; + + if (gm * 4 < psc(GM) && gn < psc(GN)) + { + ivec4 sumi = tmp_o[sgi][siq]; + const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3); + + vec4 descale = vec4(0.f); + descale.r = A_descales_data[gm4.r]; + if (gm4.g < psc(GM)) descale.g = A_descales_data[gm4.g]; + if (gm4.b < psc(GM)) descale.b = A_descales_data[gm4.b]; + if (gm4.a < psc(GM)) descale.a = A_descales_data[gm4.a]; + + vec4 sumfp = vec4(sumi) * descale * B_descale_data[0]; + + if (broadcast_type_C == 0) + { + sumfp += vec4(float(buffer_ld1(C_blob_data, 0)) * beta); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + sumfp.r += float(buffer_ld1(C_blob_data, gm4.r)) * beta; + if (gm4.g < psc(GM)) sumfp.g += float(buffer_ld1(C_blob_data, gm4.g)) * beta; + if (gm4.b < psc(GM)) sumfp.b += float(buffer_ld1(C_blob_data, gm4.b)) * beta; + if (gm4.a < psc(GM)) sumfp.a += float(buffer_ld1(C_blob_data, gm4.a)) * beta; + } + if (broadcast_type_C == 3) + { + sumfp.r += float(buffer_ld1(C_blob_data, gm4.r * psc(GN) + gn)) * beta; + if (gm4.g < psc(GM)) sumfp.g += float(buffer_ld1(C_blob_data, gm4.g * psc(GN) + gn)) * beta; + if (gm4.b < psc(GM)) sumfp.b += 
float(buffer_ld1(C_blob_data, gm4.b * psc(GN) + gn)) * beta; + if (gm4.a < psc(GM)) sumfp.a += float(buffer_ld1(C_blob_data, gm4.a * psc(GN) + gn)) * beta; + } + if (broadcast_type_C == 4) + { + sumfp += vec4(float(buffer_ld1(C_blob_data, gn)) * beta); + } + + if (alpha != 1.f) + { + sumfp *= alpha; + } + + buffer_st4(top_blob_data_4, gm * p.outhstep + gn, afpvec4(sumfp)); + } + } + } + } + else + { + const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N; + const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size; + [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++) + { + const uint siq = si + q * subgroup_size; + + if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN) + { + const uint zn = siq / (M * Nd4 * UNROLL_SG_M); + const uint zmij = siq % (M * Nd4 * UNROLL_SG_M); + const uint zm = zmij / (M * Nd4); + const uint ij = zmij % (M * Nd4); + const uint i = ij / Nd4; + const uint j = ij % Nd4; + + const uint gm = (mi + zm) * M + i; + const uint gn = (ni + zn) * N + j * 4; + + if (gm < psc(GM) && gn < psc(GN)) + { + ivec4 sumi = tmp_o[sgi][siq]; + vec4 sumfp = vec4(sumi) * A_descales_data[gm] * B_descale_data[0]; + + if (broadcast_type_C == 0) + { + sumfp += vec4(float(buffer_ld1(C_blob_data, 0)) * beta); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + sumfp += vec4(float(buffer_ld1(C_blob_data, gm)) * beta); + } + if (broadcast_type_C == 3) + { + sumfp.r += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn)) * beta; + if (gn + 1 < psc(GN)) sumfp.g += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 1)) * beta; + if (gn + 2 < psc(GN)) sumfp.b += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 2)) * beta; + if (gn + 3 < psc(GN)) sumfp.a += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 3)) * beta; + } + if (broadcast_type_C == 4) + { + sumfp.r += float(buffer_ld1(C_blob_data, gn)) * beta; + if (gn + 1 < psc(GN)) sumfp.g += float(buffer_ld1(C_blob_data, gn + 1)) * 
beta; + if (gn + 2 < psc(GN)) sumfp.b += float(buffer_ld1(C_blob_data, gn + 2)) * beta; + if (gn + 3 < psc(GN)) sumfp.a += float(buffer_ld1(C_blob_data, gn + 3)) * beta; + } + + if (alpha != 1.f) + { + sumfp *= alpha; + } + + if (output_transpose == 1) + { + if (psc(out_elempack) == 4) + { + buffer_st4(top_blob_data_4, (gn / 4) * p.outhstep + gm, afpvec4(sumfp)); + } + else + { + buffer_st1(top_blob_data, gn * p.outhstep + gm, sumfp.r); + if (gn + 1 < psc(GN)) buffer_st1(top_blob_data, (gn + 1) * p.outhstep + gm, sumfp.g); + if (gn + 2 < psc(GN)) buffer_st1(top_blob_data, (gn + 2) * p.outhstep + gm, sumfp.b); + if (gn + 3 < psc(GN)) buffer_st1(top_blob_data, (gn + 3) * p.outhstep + gm, sumfp.a); + } + } + else + { + buffer_st1(top_blob_data, gm * p.outhstep + gn, sumfp.r); + if (gn + 1 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 1, sumfp.g); + if (gn + 2 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 2, sumfp.b); + if (gn + 3 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 3, sumfp.a); + } + } + } + } + } +} diff --git a/src/layer/vulkan/shader/gemm_quantize_A_int8.comp b/src/layer/vulkan/shader/gemm_quantize_A_int8.comp new file mode 100644 index 000000000000..ab5c97518a3f --- /dev/null +++ b/src/layer/vulkan/shader/gemm_quantize_A_int8.comp @@ -0,0 +1,46 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + /* Per-index int8 quantization of A: each invocation i below p.M scans p.K elements for their largest magnitude, stores a descale in A_descales_data[i], and writes the quantized elements to A_int8_data. */ +#version 450 + +layout(constant_id = 0) const int transA = 0; + /* transA == 1 reads element (i, k) at k * A_hstep plus i; otherwise at i * A_hstep plus k */ +layout(binding = 0) readonly buffer A_blob { sfp A_blob_data[]; }; +layout(binding = 1) writeonly buffer A_int8_blob { sint8 A_int8_data[]; }; +layout(binding = 2) writeonly buffer A_descales_blob { float A_descales_data[]; }; + +layout(push_constant) uniform parameter +{ + int M; + int K; + int A_dims; + int A_hstep; +} p; + +void main() +{ + const int i = int(gl_GlobalInvocationID.x); + /* surplus invocations exit early */ + if (i >= p.M) + return; + + float absmax = 0.f; + /* pass 1: largest absolute value among the p.K elements addressed by i */ + for (int k = 0; k < p.K; k++) + { + const int ai = transA == 1 ? 
k * p.A_hstep + i : i * p.A_hstep + k; + const float v = float(buffer_ld1(A_blob_data, ai)); + absmax = max(absmax, abs(v)); + } + /* scale maps absmax onto 127; when every element is zero, scale stays 1 and the stored descale is 0 */ + const float scale = absmax == 0.f ? 1.f : 127.f / absmax; + A_descales_data[i] = absmax == 0.f ? 0.f : absmax * (1.f / 127.f); + /* pass 2: read the same elements again, multiply by scale, convert with float2int8, and store at i * p.K plus k */ + for (int k = 0; k < p.K; k++) + { + const int ai = transA == 1 ? k * p.A_hstep + i : i * p.A_hstep + k; + const float v = float(buffer_ld1(A_blob_data, ai)) * scale; + const int vi = float2int8(v); + i8buffer_st1(A_int8_data, i * p.K + k, vi); + } +} diff --git a/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp new file mode 100644 index 000000000000..13a77bffe7e6 --- /dev/null +++ b/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp @@ -0,0 +1,53 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + /* Stage 1 of the B absolute-maximum reduction: each workgroup covers up to 1024 consecutive flat indices below p.size and writes their largest magnitude to B_absmax_data at its workgroup index. */ +#version 450 + +layout(constant_id = 0) const int transB = 0; + +layout(binding = 0) readonly buffer B_blob { sfp B_blob_data[]; }; +layout(binding = 1) writeonly buffer B_absmax_blob { float B_absmax_data[]; }; + +layout(push_constant) uniform parameter +{ + int N; + int K; + int B_dims; + int B_hstep; + int size; +} p; + /* the shared array size and the loop strides assume 128 invocations per workgroup. NOTE(review): the local size is not declared in this file; confirm it is 128 */ +shared float absmax_shared[128]; + +void main() +{ + const int lid = int(gl_LocalInvocationID.x); + const int block_offset = int(gl_WorkGroupID.x) * 1024; + const int block_end = min(block_offset + 1024, p.size); + + float absmax = 0.f; + /* each invocation strides through the block by 128; flat index i splits into j = i / p.K and k = the remainder */ + for (int i = block_offset + lid; i < block_end; i += 128) + { + const int j = i / p.K; + const int k = i - j * p.K; + const int bi = transB == 1 ? 
j * p.B_hstep + k : k * p.B_hstep + j; + const float v = float(buffer_ld1(B_blob_data, bi)); + absmax = max(absmax, abs(v)); + } + /* store this invocation's partial maximum in its shared slot */ + absmax_shared[lid] = absmax; + + barrier(); + /* halve the active range each step (64, 32, ..., 1), keeping the larger value of each pair; barrier() separates the steps */ + for (int stride = 64; stride > 0; stride >>= 1) + { + if (lid < stride) + absmax_shared[lid] = max(absmax_shared[lid], absmax_shared[lid + stride]); + + barrier(); + } + /* invocation 0 writes the workgroup result */ + if (lid == 0) + B_absmax_data[int(gl_WorkGroupID.x)] = absmax_shared[0]; +} diff --git a/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp new file mode 100644 index 000000000000..64e51ed76c66 --- /dev/null +++ b/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp @@ -0,0 +1,44 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + /* Stage 2: folds the p.blocks partial maxima in B_absmax_data into one value and writes the descale to B_descale_data[0]. NOTE(review): only the local invocation index is used, so this presumably runs as a single 128-invocation workgroup; confirm against the dispatch code. */ +#version 450 + +layout(binding = 0) readonly buffer B_absmax_blob { float B_absmax_data[]; }; +layout(binding = 1) writeonly buffer B_descale_blob { float B_descale_data[]; }; + +layout(push_constant) uniform parameter +{ + int blocks; +} p; + +shared float absmax_shared[128]; + +void main() +{ + const int lid = int(gl_LocalInvocationID.x); + + float absmax = 0.f; + /* strided scan of the partial maxima, 128 entries apart per invocation */ + for (int i = lid; i < p.blocks; i += 128) + { + absmax = max(absmax, B_absmax_data[i]); + } + + absmax_shared[lid] = absmax; + + barrier(); + /* shared-memory max reduction, active range 64 down to 1 */ + for (int stride = 64; stride > 0; stride >>= 1) + { + if (lid < stride) + absmax_shared[lid] = max(absmax_shared[lid], absmax_shared[lid + stride]); + + barrier(); + } + + if (lid == 0) + { + absmax = absmax_shared[0]; + B_descale_data[0] = absmax == 0.f ? 
0.f : absmax * (1.f / 127.f); /* descale is absmax / 127, or 0 when absmax is 0 */ + } +} diff --git a/src/layer/vulkan/shader/gemm_quantize_B_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_int8.comp new file mode 100644 index 000000000000..704a5ca340f0 --- /dev/null +++ b/src/layer/vulkan/shader/gemm_quantize_B_int8.comp @@ -0,0 +1,57 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + /* Quantizes B to int8 with the single descale from B_descale_data[0]; invocation gx handles the four consecutive flat indices starting at 4 * gx. */ +#version 450 + +layout(constant_id = 0) const int transB = 0; + +layout(binding = 0) readonly buffer B_blob { sfp B_blob_data[]; }; +layout(binding = 1) writeonly buffer B_int8_blob { sint8 B_int8_data[]; }; +layout(binding = 2) readonly buffer B_descale_blob { float B_descale_data[]; }; +layout(binding = 3) writeonly buffer B_int8_blob_pack4 { sint8vec4 B_int8_data_pack4[]; }; + /* NOTE(review): bindings 1 and 3 are presumably scalar and 4-packed views of the same output buffer; confirm against the host code */ +layout(push_constant) uniform parameter +{ + int N; + int K; + int B_dims; + int B_hstep; + int size; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x); + const int i = gx * 4; + + if (i >= p.size) + return; + /* scale is the reciprocal of the descale; a descale of 0 falls back to scale 1 */ + const float descale = B_descale_data[0]; + const float scale = descale == 0.f ? 1.f : 1.f / descale; + /* lanes whose index reaches p.size stay 0 */ + ivec4 v = ivec4(0); + for (int lane = 0; lane < 4; lane++) + { + const int i1 = i + lane; + if (i1 >= p.size) + break; + /* j = i1 / p.K, k = the remainder; transB picks which one is multiplied by B_hstep */ + const int j = i1 / p.K; + const int k = i1 - j * p.K; + const int bi = transB == 1 ? 
j * p.B_hstep + k : k * p.B_hstep + j; + const float vf = float(buffer_ld1(B_blob_data, bi)) * scale; + v[lane] = float2int8(vf); + } + /* a complete group of four goes out as one packed store at index gx; an incomplete final group is stored one element at a time */ + if (i + 3 < p.size) + { + i8buffer_st4(B_int8_data_pack4, gx, v); + } + else + { + i8buffer_st1(B_int8_data, i + 0, v.r); + if (i + 1 < p.size) i8buffer_st1(B_int8_data, i + 1, v.g); + if (i + 2 < p.size) i8buffer_st1(B_int8_data, i + 2, v.b); + } +} diff --git a/src/layer/vulkan/shader/innerproduct_gemm_int8.comp b/src/layer/vulkan/shader/innerproduct_gemm_int8.comp new file mode 100644 index 000000000000..90cc11673e6e --- /dev/null +++ b/src/layer/vulkan/shader/innerproduct_gemm_int8.comp @@ -0,0 +1,415 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +#define LOCAL_MEMORY_UNROLL_INCH 8 + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int input_elempack = 1; +layout(constant_id = 5) const int weight_data_stride = 0; + +#define shape_constant_id_offset 6 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int 
outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; +layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; +layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 3) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; +layout(binding = 4) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +layout(binding = 5) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; +layout(binding = 6) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +#if NCNN_shader_local_memory +// avoid bank conflict +#define PAD 1 +shared int tmp_v[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; +shared int tmp_k[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; +#endif + +void main() +{ + const uint gx = gl_GlobalInvocationID.x; + const uint gy = gl_GlobalInvocationID.y; + const uint gz = gl_GlobalInvocationID.z; + +#if !NCNN_shader_local_memory + if (gx * 4 >= psc(outw) || gy * 4 >= psc(outh) || gz >= 1) + return; +#else + if (gz >= 1) + return; +#endif + + const int elempack = input_elempack == 0 ? 
psc(c) : input_elempack; + const int output_elempack = elempack; + + ivec4 sum0 = ivec4(0); + ivec4 sum1 = ivec4(0); + ivec4 sum2 = ivec4(0); + ivec4 sum3 = ivec4(0); + +#if NCNN_shader_local_memory + const uint lx = gl_LocalInvocationID.x; + const uint ly = gl_LocalInvocationID.y; + + for (int i = 0; i < weight_data_stride * 4; i += LOCAL_MEMORY_UNROLL_INCH) + { + { + int v = 0; + const int vk = i + int(lx / 4) * 4; + const uint vy = gy * 4 + lx % 4; + if (vy < psc(outh) && vk < psc(w)) + { + if (elempack == 4) + { + ivec4 v4 = ivec4(0); + const int vy4 = int(vy) / 4; + const int vlane = int(vy) % 4; + const int vi = vy4 * psc(cstep) + vk; + v4.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane]; + if (vk + 1 < psc(w)) v4.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane]; + if (vk + 2 < psc(w)) v4.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane]; + if (vk + 3 < psc(w)) v4.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane]; + + v = packInt4x8(v4); + } + else // elempack == 1 + { + ivec4 v4 = ivec4(0); + const int vi = int(vy) * psc(cstep) + vk; + v4.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0); + if (vk + 1 < psc(w)) v4.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1); + if (vk + 2 < psc(w)) v4.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2); + if (vk + 3 < psc(w)) v4.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3); + + v = packInt4x8(v4); + } + } + + tmp_v[ly][lx] = v; + + const int kk = i + int(ly / 4) * 4; + const int w_offset = int(gx) * weight_data_stride + kk / 4; + const int k = i8buffer_sm4(weight_data, w_offset * 4 + int(ly % 4)); + + tmp_k[lx][ly] = k; + } + + barrier(); + + for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH / 4; k4++) + { + const int kk = k4 * 4; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + const int v0 = tmp_v[ly][kk + 0]; + const int v1 = tmp_v[ly][kk + 1]; + const int v2 = tmp_v[ly][kk + 2]; + const int v3 = 
tmp_v[ly][kk + 3]; + + const int k0 = tmp_k[lx][kk + 0]; + const int k1 = tmp_k[lx][kk + 1]; + const int k2 = tmp_k[lx][kk + 2]; + const int k3 = tmp_k[lx][kk + 3]; + + sum0 += ivec4(dotPacked4x8EXT(v0, k0), dotPacked4x8EXT(v0, k1), dotPacked4x8EXT(v0, k2), dotPacked4x8EXT(v0, k3)); + sum1 += ivec4(dotPacked4x8EXT(v1, k0), dotPacked4x8EXT(v1, k1), dotPacked4x8EXT(v1, k2), dotPacked4x8EXT(v1, k3)); + sum2 += ivec4(dotPacked4x8EXT(v2, k0), dotPacked4x8EXT(v2, k1), dotPacked4x8EXT(v2, k2), dotPacked4x8EXT(v2, k3)); + sum3 += ivec4(dotPacked4x8EXT(v3, k0), dotPacked4x8EXT(v3, k1), dotPacked4x8EXT(v3, k2), dotPacked4x8EXT(v3, k3)); +#else + const ivec4 v0 = unpackInt4x8(tmp_v[ly][kk + 0]); + const ivec4 v1 = unpackInt4x8(tmp_v[ly][kk + 1]); + const ivec4 v2 = unpackInt4x8(tmp_v[ly][kk + 2]); + const ivec4 v3 = unpackInt4x8(tmp_v[ly][kk + 3]); + + const ivec4 k0 = unpackInt4x8(tmp_k[lx][kk + 0]); + const ivec4 k1 = unpackInt4x8(tmp_k[lx][kk + 1]); + const ivec4 k2 = unpackInt4x8(tmp_k[lx][kk + 2]); + const ivec4 k3 = unpackInt4x8(tmp_k[lx][kk + 3]); + + sum0.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a; + sum0.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a; + sum0.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a; + sum0.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a; + sum1.r += v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a; + sum1.g += v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a; + sum1.b += v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a; + sum1.a += v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a; + sum2.r += v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a; + sum2.g += v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a; + sum2.b += v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a; + sum2.a += v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a; + sum3.r += v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a; + sum3.g += v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a; 
+ sum3.b += v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a; + sum3.a += v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a; +#endif + } + + barrier(); + } + + if (gx * 4 >= psc(outw) || gy * 4 >= psc(outh)) + return; +#else + for (int i = 0; i < weight_data_stride * 4; i += 4) + { + const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3); + const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3); + + int v0 = 0; + int v1 = 0; + int v2 = 0; + int v3 = 0; + + if (i < psc(w)) + { + if (elempack == 4) + { + if (gy4.r < psc(outh)) + { + ivec4 v = ivec4(0); + const int vy4 = int(gy4.r) / 4; + const int vlane = int(gy4.r) % 4; + const int vi = vy4 * psc(cstep) + i; + v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane]; + if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane]; + if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane]; + if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane]; + + v0 = packInt4x8(v); + } + if (gy4.g < psc(outh)) + { + ivec4 v = ivec4(0); + const int vy4 = int(gy4.g) / 4; + const int vlane = int(gy4.g) % 4; + const int vi = vy4 * psc(cstep) + i; + v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane]; + if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane]; + if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane]; + if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane]; + + v1 = packInt4x8(v); + } + if (gy4.b < psc(outh)) + { + ivec4 v = ivec4(0); + const int vy4 = int(gy4.b) / 4; + const int vlane = int(gy4.b) % 4; + const int vi = vy4 * psc(cstep) + i; + v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane]; + if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane]; + if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane]; + if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane]; + + v2 = packInt4x8(v); + } + if (gy4.a < 
psc(outh)) + { + ivec4 v = ivec4(0); + const int vy4 = int(gy4.a) / 4; + const int vlane = int(gy4.a) % 4; + const int vi = vy4 * psc(cstep) + i; + v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane]; + if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane]; + if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane]; + if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane]; + + v3 = packInt4x8(v); + } + } + else // elempack == 1 + { + if (gy4.r < psc(outh)) + { + ivec4 v = ivec4(0); + const int vi = int(gy4.r) * psc(cstep) + i; + v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0); + if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1); + if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2); + if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3); + + v0 = packInt4x8(v); + } + if (gy4.g < psc(outh)) + { + ivec4 v = ivec4(0); + const int vi = int(gy4.g) * psc(cstep) + i; + v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0); + if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1); + if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2); + if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3); + + v1 = packInt4x8(v); + } + if (gy4.b < psc(outh)) + { + ivec4 v = ivec4(0); + const int vi = int(gy4.b) * psc(cstep) + i; + v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0); + if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1); + if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2); + if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3); + + v2 = packInt4x8(v); + } + if (gy4.a < psc(outh)) + { + ivec4 v = ivec4(0); + const int vi = int(gy4.a) * psc(cstep) + i; + v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0); + if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1); + if (i + 2 < psc(w)) v.b = 
i8buffer_ld1(bottom_blob_int8_data_1, vi + 2); + if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3); + + v3 = packInt4x8(v); + } + } + } + + const int wi = i / 4; + const int w_offset = int(gx) * weight_data_stride + wi; + + const int k0 = i8buffer_sm4(weight_data, w_offset * 4 + 0); + const int k1 = i8buffer_sm4(weight_data, w_offset * 4 + 1); + const int k2 = i8buffer_sm4(weight_data, w_offset * 4 + 2); + const int k3 = i8buffer_sm4(weight_data, w_offset * 4 + 3); + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum0 += ivec4(dotPacked4x8EXT(v0, k0), dotPacked4x8EXT(v0, k1), dotPacked4x8EXT(v0, k2), dotPacked4x8EXT(v0, k3)); + sum1 += ivec4(dotPacked4x8EXT(v1, k0), dotPacked4x8EXT(v1, k1), dotPacked4x8EXT(v1, k2), dotPacked4x8EXT(v1, k3)); + sum2 += ivec4(dotPacked4x8EXT(v2, k0), dotPacked4x8EXT(v2, k1), dotPacked4x8EXT(v2, k2), dotPacked4x8EXT(v2, k3)); + sum3 += ivec4(dotPacked4x8EXT(v3, k0), dotPacked4x8EXT(v3, k1), dotPacked4x8EXT(v3, k2), dotPacked4x8EXT(v3, k3)); +#else + const ivec4 v0v = unpackInt4x8(v0); + const ivec4 v1v = unpackInt4x8(v1); + const ivec4 v2v = unpackInt4x8(v2); + const ivec4 v3v = unpackInt4x8(v3); + + const ivec4 k0v = unpackInt4x8(k0); + const ivec4 k1v = unpackInt4x8(k1); + const ivec4 k2v = unpackInt4x8(k2); + const ivec4 k3v = unpackInt4x8(k3); + + sum0.r += v0v.r * k0v.r + v0v.g * k0v.g + v0v.b * k0v.b + v0v.a * k0v.a; + sum0.g += v0v.r * k1v.r + v0v.g * k1v.g + v0v.b * k1v.b + v0v.a * k1v.a; + sum0.b += v0v.r * k2v.r + v0v.g * k2v.g + v0v.b * k2v.b + v0v.a * k2v.a; + sum0.a += v0v.r * k3v.r + v0v.g * k3v.g + v0v.b * k3v.b + v0v.a * k3v.a; + sum1.r += v1v.r * k0v.r + v1v.g * k0v.g + v1v.b * k0v.b + v1v.a * k0v.a; + sum1.g += v1v.r * k1v.r + v1v.g * k1v.g + v1v.b * k1v.b + v1v.a * k1v.a; + sum1.b += v1v.r * k2v.r + v1v.g * k2v.g + v1v.b * k2v.b + v1v.a * k2v.a; + sum1.a += v1v.r * k3v.r + v1v.g * k3v.g + v1v.b * k3v.b + v1v.a 
* k3v.a; + sum2.r += v2v.r * k0v.r + v2v.g * k0v.g + v2v.b * k0v.b + v2v.a * k0v.a; + sum2.g += v2v.r * k1v.r + v2v.g * k1v.g + v2v.b * k1v.b + v2v.a * k1v.a; + sum2.b += v2v.r * k2v.r + v2v.g * k2v.g + v2v.b * k2v.b + v2v.a * k2v.a; + sum2.a += v2v.r * k3v.r + v2v.g * k3v.g + v2v.b * k3v.b + v2v.a * k3v.a; + sum3.r += v3v.r * k0v.r + v3v.g * k0v.g + v3v.b * k0v.b + v3v.a * k0v.a; + sum3.g += v3v.r * k1v.r + v3v.g * k1v.g + v3v.b * k1v.b + v3v.a * k1v.a; + sum3.b += v3v.r * k2v.r + v3v.g * k2v.g + v3v.b * k2v.b + v3v.a * k2v.a; + sum3.a += v3v.r * k3v.r + v3v.g * k3v.g + v3v.b * k3v.b + v3v.a * k3v.a; +#endif + } +#endif + + const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3); + const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3); + + const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx)); + + vec4 sumfp0 = vec4(sum0) * descale; + vec4 sumfp1 = vec4(sum1) * descale; + vec4 sumfp2 = vec4(sum2) * descale; + vec4 sumfp3 = vec4(sum3) * descale; + + if (bias_term == 1) + { + vec4 bias = vec4(buffer_ld4(bias_data, gx)); + + sumfp0 += bias; + sumfp1 += bias; + sumfp2 += bias; + sumfp3 += bias; + } + + afpvec4 out0 = activation_afpvec4(afpvec4(sumfp0), activation_type, activation_param_0, activation_param_1); + afpvec4 out1 = activation_afpvec4(afpvec4(sumfp1), activation_type, activation_param_0, activation_param_1); + afpvec4 out2 = activation_afpvec4(afpvec4(sumfp2), activation_type, activation_param_0, activation_param_1); + afpvec4 out3 = activation_afpvec4(afpvec4(sumfp3), activation_type, activation_param_0, activation_param_1); + + if (output_elempack == 4) + { + const uint gi = gy * uint(psc(outw)) + gx * 4; + + buffer_st4(top_blob_data_4, gi + 0, afpvec4(out0.r, out1.r, out2.r, out3.r)); + if (gx4.g < psc(outw)) buffer_st4(top_blob_data_4, gi + 1, afpvec4(out0.g, out1.g, out2.g, out3.g)); + if (gx4.b < psc(outw)) buffer_st4(top_blob_data_4, gi + 2, afpvec4(out0.b, out1.b, out2.b, out3.b)); + if (gx4.a < psc(outw)) buffer_st4(top_blob_data_4, gi + 3, afpvec4(out0.a, 
out1.a, out2.a, out3.a)); + + return; + } + + const uvec4 gi4 = gy4 * uint(psc(outw)) + gx * 4; + + buffer_st1(top_blob_data_1, gi4.r, out0.r); + if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 1, out0.g); + if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 2, out0.b); + if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 3, out0.a); + if (gy4.g < psc(outh)) + { + buffer_st1(top_blob_data_1, gi4.g, out1.r); + if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 1, out1.g); + if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 2, out1.b); + if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 3, out1.a); + } + if (gy4.b < psc(outh)) + { + buffer_st1(top_blob_data_1, gi4.b, out2.r); + if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 1, out2.g); + if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 2, out2.b); + if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 3, out2.a); + } + if (gy4.a < psc(outh)) + { + buffer_st1(top_blob_data_1, gi4.a, out3.r); + if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 1, out3.g); + if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 2, out3.b); + if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 3, out3.a); + } +} diff --git a/src/layer/vulkan/shader/innerproduct_int8.comp b/src/layer/vulkan/shader/innerproduct_int8.comp new file mode 100644 index 000000000000..0bbd791c01f7 --- /dev/null +++ b/src/layer/vulkan/shader/innerproduct_int8.comp @@ -0,0 +1,87 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 
2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; +layout(constant_id = 4) const int weight_data_stride = 0; + +#define shape_constant_id_offset 5 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int outw = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; +layout(binding = 3) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; +layout(binding = 4) readonly buffer bias_blob { sfpvec4 bias_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int outw; +} p; + +void main() +{ + const int gx = int(gl_GlobalInvocationID.x) * 4; + const int gy = int(gl_GlobalInvocationID.y); + const int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 sum = ivec4(0); + + for (int i = 0; i < psc(w); i += 4) + { + const int v4 = i8buffer_sm4(bottom_blob_int8_data, i / 4); + const int wi = i / 4; + const int w_offset = gx / 4 * weight_data_stride + wi; + + const int k0 = i8buffer_sm4(weight_data, w_offset * 4 + 0); + const int k1 = i8buffer_sm4(weight_data, w_offset * 4 + 1); + const int k2 = i8buffer_sm4(weight_data, w_offset * 4 + 2); + const int k3 = i8buffer_sm4(weight_data, w_offset * 4 + 3); + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum += ivec4(dotPacked4x8EXT(v4, k0), dotPacked4x8EXT(v4, k1), dotPacked4x8EXT(v4, k2), dotPacked4x8EXT(v4, k3)); +#else + const ivec4 v0 = unpackInt4x8(v4); + const ivec4 k0v = unpackInt4x8(k0); + const ivec4 k1v = unpackInt4x8(k1); + const ivec4 k2v = unpackInt4x8(k2); + const ivec4 k3v = unpackInt4x8(k3); + + sum += ivec4(v0.r * k0v.r + v0.g * k0v.g + 
v0.b * k0v.b + v0.a * k0v.a, + v0.r * k1v.r + v0.g * k1v.g + v0.b * k1v.b + v0.a * k1v.a, + v0.r * k2v.r + v0.g * k2v.g + v0.b * k2v.b + v0.a * k2v.a, + v0.r * k3v.r + v0.g * k3v.g + v0.b * k3v.b + v0.a * k3v.a); +#endif + } + + const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx / 4)); + + afpvec4 sumfp32 = afpvec4(vec4(sum) * descale); + + if (bias_term == 1) + { + vec4 bias = vec4(buffer_ld4(bias_data, gx / 4)); + + sumfp32 += afpvec4(bias); + } + + sumfp32 = activation_afpvec4(sumfp32, activation_type, activation_param_0, activation_param_1); + + buffer_st4(top_blob_data, gx / 4, sumfp32); +} diff --git a/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp b/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp new file mode 100644 index 000000000000..41f70a653503 --- /dev/null +++ b/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp @@ -0,0 +1,69 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#extension GL_GOOGLE_include_directive : enable +#include "vulkan_activation.comp" + +layout(constant_id = 0) const int bias_term = 0; +layout(constant_id = 1) const int activation_type = 0; +layout(constant_id = 2) const float activation_param_0 = 0; +layout(constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout(constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout(constant_id = shape_constant_id_offset + 2) const int outw = 0; + +layout(binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; +layout(binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + + int outw; +} p; + +void main() +{ + int gx = 
int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sumfp32; + + if (bias_term == 1) + { + sumfp32 = buffer_ld4(bias_data, gx); + } + else + { + sumfp32 = afpvec4(0.f); + } + + int v_offset = gx * psc(w); + + ivec4 sum = ivec4(0); + + for (int i = 0; i < psc(w); i++) + { + sum += bottom_blob_data[v_offset + i]; + } + + const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx)); + + sumfp32 += afpvec4(vec4(sum) * descale); + + sumfp32 = activation_afpvec4(sumfp32, activation_type, activation_param_0, activation_param_1); + + buffer_st4(top_blob_data, gx, sumfp32); +} diff --git a/src/layer/vulkan/shader/innerproduct_sum8_int8.comp b/src/layer/vulkan/shader/innerproduct_sum8_int8.comp new file mode 100644 index 000000000000..d6d84378d68c --- /dev/null +++ b/src/layer/vulkan/shader/innerproduct_sum8_int8.comp @@ -0,0 +1,75 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated +#extension GL_EXT_integer_dot_product : require +#endif + +layout(constant_id = 0) const int weight_data_stride = 0; + +#define shape_constant_id_offset 1 +layout(constant_id = shape_constant_id_offset + 0) const int w = 0; + +layout(constant_id = shape_constant_id_offset + 1) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 2) const int outh = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; +layout(binding = 1) writeonly buffer top_blob { ivec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + + int outw; + int outh; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = 
int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= 1) + return; + + const int start = gx * 8; + const int end = min(gx * 8 + 8, (psc(w) + 3) / 4); + + ivec4 sum = ivec4(0); + + int w_offset = gy * weight_data_stride + start; + + for (int i = start; i < end; i++) + { + const int v4 = i8buffer_sm4(bottom_blob_int8_data, i); + + const ivec4 k = weight_data[w_offset]; + +#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated + sum.r += dotPacked4x8EXT(v4, k.x); + sum.g += dotPacked4x8EXT(v4, k.y); + sum.b += dotPacked4x8EXT(v4, k.z); + sum.a += dotPacked4x8EXT(v4, k.w); +#else + const ivec4 v0 = unpackInt4x8(v4); + const ivec4 k0v = unpackInt4x8(k.x); + const ivec4 k1v = unpackInt4x8(k.y); + const ivec4 k2v = unpackInt4x8(k.z); + const ivec4 k3v = unpackInt4x8(k.w); + + sum.r += v0.r * k0v.r + v0.g * k0v.g + v0.b * k0v.b + v0.a * k0v.a; + sum.g += v0.r * k1v.r + v0.g * k1v.g + v0.b * k1v.b + v0.a * k1v.a; + sum.b += v0.r * k2v.r + v0.g * k2v.g + v0.b * k2v.b + v0.a * k2v.a; + sum.a += v0.r * k3v.r + v0.g * k3v.g + v0.b * k3v.b + v0.a * k3v.a; +#endif + + w_offset += 1; + } + + const int gi = gy * psc(outw) + gx; + top_blob_data[gi] = sum; +} diff --git a/src/layer/vulkan/shader/packing_pack1to4_int8.comp b/src/layer/vulkan/shader/packing_pack1to4_int8.comp index 354094ec6e80..4119a2d28832 100644 --- a/src/layer/vulkan/shader/packing_pack1to4_int8.comp +++ b/src/layer/vulkan/shader/packing_pack1to4_int8.comp @@ -35,12 +35,6 @@ void main() const uint gi = gy * psc(n) + gx; - // if (cast_type_from == cast_type_to) - // { - // i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, gi4); - // return; - // } - ivec4 v; if (cast_type_from == 3) { @@ -51,10 +45,18 @@ void main() } else { - v.r = i8buffer_ld1(bottom_blob_data, gi4.r); - v.g = i8buffer_ld1(bottom_blob_data, gi4.g); - v.b = i8buffer_ld1(bottom_blob_data, gi4.b); - v.a = i8buffer_ld1(bottom_blob_data, gi4.a); + 
if (cast_type_to == 3) + { + v.r = i8buffer_ld1(bottom_blob_data, gi4.r); + v.g = i8buffer_ld1(bottom_blob_data, gi4.g); + v.b = i8buffer_ld1(bottom_blob_data, gi4.b); + v.a = i8buffer_ld1(bottom_blob_data, gi4.a); + } + else + { + i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, ivec4(gi4)); + return; + } } if (cast_type_to == 3) diff --git a/src/layer/vulkan/shader/packing_pack4to1_int8.comp b/src/layer/vulkan/shader/packing_pack4to1_int8.comp index c8b5b523b3e2..dd51546d1aa5 100644 --- a/src/layer/vulkan/shader/packing_pack4to1_int8.comp +++ b/src/layer/vulkan/shader/packing_pack4to1_int8.comp @@ -35,12 +35,6 @@ void main() const uvec4 gi4 = (gy * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + gx; - // if (cast_type_from == cast_type_to) - // { - // buffer_cp4to1(top_blob_data, gi4, bottom_blob_data, gi); - // return; - // } - ivec4 v; if (cast_type_from == 3) { @@ -48,7 +42,15 @@ void main() } else { - v = i8buffer_ld4(bottom_blob_data, gi); + if (cast_type_to == 3) + { + v = i8buffer_ld4(bottom_blob_data, gi); + } + else + { + i8buffer_cp4to1(top_blob_data, ivec4(gi4), bottom_blob_data, int(gi)); + return; + } } if (cast_type_to == 3) diff --git a/src/layer/vulkan/shader/padding_3d_int8.comp b/src/layer/vulkan/shader/padding_3d_int8.comp new file mode 100644 index 000000000000..cd2c31609c67 --- /dev/null +++ b/src/layer/vulkan/shader/padding_3d_int8.comp @@ -0,0 +1,116 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int d = 0; +layout(constant_id = shape_constant_id_offset + 
4) const int c = 0; +layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; +layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int d; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outd; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) + return; + + const int pad_value = int(value); + + // if (psc(dims) == 4) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int yh = gy % psc(outh) - p.top; + int yd = gy / psc(outh) - p.front; + int y = yd * psc(h) + yh; + int z = gz; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && yh >= 0 && yh < psc(h) && yd >= 0 && yd < psc(d) && z >= 0 && z < psc(c)) + { + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + else if (per_channel_pad == 1) + { + i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); + } + else + { + i8buffer_st1(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + yh = 
clamp(yh, 0, psc(h) - 1); + yd = clamp(yd, 0, psc(d) - 1); + y = yd * psc(h) + yh; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + yh = abs(yh); + yd = abs(yd); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + yh = (p.h - 1) - abs(yh - (p.h - 1)); + yd = (p.d - 1) - abs(yd - (p.d - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // yh = (psc(h) - 1) - abs(yh - (psc(h) - 1)); + // yd = (psc(d) - 1) - abs(yd - (psc(d) - 1)); + y = yd * psc(h) + yh; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + } +} diff --git a/src/layer/vulkan/shader/padding_3d_pack4_int8.comp b/src/layer/vulkan/shader/padding_3d_pack4_int8.comp new file mode 100644 index 000000000000..149f221f6ad3 --- /dev/null +++ b/src/layer/vulkan/shader/padding_3d_pack4_int8.comp @@ -0,0 +1,116 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int d = 0; +layout(constant_id = shape_constant_id_offset + 4) const int c = 0; +layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; +layout(constant_id = 
shape_constant_id_offset + 9) const int outd = 0; +layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int d; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outd; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) + return; + + const ivec4 pad_value = ivec4(int(value)); + + // if (psc(dims) == 4) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int yh = gy % psc(outh) - p.top; + int yd = gy / psc(outh) - p.front; + int y = yd * psc(h) + yh; + int z = gz; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && yh >= 0 && yh < psc(h) && yd >= 0 && yd < psc(d) && z >= 0 && z < psc(c)) + { + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); + } + else if (per_channel_pad == 1) + { + i8buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); + } + else + { + i8buffer_st4(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + yh = clamp(yh, 0, psc(h) - 1); + yd = clamp(yd, 0, psc(d) - 1); + y = yd * psc(h) + yh; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + yh = abs(yh); + yd = abs(yd); + // NOTE psc(X) get zeros on nvidia + // TODO 
only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + yh = (p.h - 1) - abs(yh - (p.h - 1)); + yd = (p.d - 1) - abs(yd - (p.d - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // yh = (psc(h) - 1) - abs(yh - (psc(h) - 1)); + // yd = (psc(d) - 1) - abs(yd - (psc(d) - 1)); + y = yd * psc(h) + yh; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); + } + } +} diff --git a/src/layer/vulkan/shader/padding_int8.comp b/src/layer/vulkan/shader/padding_int8.comp new file mode 100644 index 000000000000..ea195035fdaf --- /dev/null +++ b/src/layer/vulkan/shader/padding_int8.comp @@ -0,0 +1,182 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int 
dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int pad_value = int(value); + + if (psc(dims) == 1) + { + int x = gx - p.left; + + if (type == 0) + { + if (x >= 0 && x < psc(w)) + { + i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x); + } + else + { + i8buffer_st1(top_blob_data, gx, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + + i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x); + } + if (type == 2) + { + x = abs(x); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + + i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x); + } + } + else if (psc(dims) == 2) + { + const int gi = gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { + int v_offset = y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + else + { + i8buffer_st1(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + + int v_offset = y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + + int v_offset = y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); 
+ } + } + else // if (psc(dims) == 3) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + int z = gz - p.front; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c)) + { + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + else if (per_channel_pad == 1) + { + i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); + } + else + { + i8buffer_st1(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + z = clamp(z, 0, psc(c) - 1); + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + z = abs(z); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + z = (p.c - 1) - abs(z - (p.c - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + // z = (psc(c) - 1) - abs(z - (psc(c) - 1)); + + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); + } + } +} diff --git a/src/layer/vulkan/shader/padding_pack1to4_int8.comp b/src/layer/vulkan/shader/padding_pack1to4_int8.comp new file mode 100644 index 000000000000..e3629d898815 --- /dev/null +++ b/src/layer/vulkan/shader/padding_pack1to4_int8.comp @@ -0,0 +1,185 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) 
const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const ivec4 pad_value = ivec4(int(value)); + + if (psc(dims) == 1) + { + ivec4 x4 = gx * 4 - p.left + ivec4(0, 1, 2, 3); + + if (type == 0) + { + bvec4 mask = bvec4(uvec4(greaterThanEqual(x4, ivec4(0))) & uvec4(lessThan(x4, ivec4(psc(w))))); + + ivec4 v; + v.r = mask.r ? i8buffer_ld1(bottom_blob_data, x4.r) : pad_value.r; + v.g = mask.g ? i8buffer_ld1(bottom_blob_data, x4.g) : pad_value.g; + v.b = mask.b ? i8buffer_ld1(bottom_blob_data, x4.b) : pad_value.b; + v.a = mask.a ? 
i8buffer_ld1(bottom_blob_data, x4.a) : pad_value.a; + + i8buffer_st4(top_blob_data, gx, v); + } + if (type == 1) + { + x4 = clamp(x4, 0, psc(w) - 1); + + i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4); + } + if (type == 2) + { + x4 = abs(x4); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x4 = (p.w - 1) - abs(x4 - (p.w - 1)); + // x4 = (psc(w) - 1) - abs(x4 - (psc(w) - 1)); + + i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4); + } + } + else if (psc(dims) == 2) + { + const int gi = gy * psc(outw) + gx; + + int x = gx - p.left; + ivec4 y4 = gy * 4 - p.top + ivec4(0, 1, 2, 3); + + if (type == 0) + { + bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(y4, ivec4(0))) & uvec4(lessThan(y4, ivec4(psc(h)))))); + + ivec4 v_offset = y4 * psc(w) + x; + + ivec4 v; + v.r = mask.r ? i8buffer_ld1(bottom_blob_data, v_offset.r) : pad_value.r; + v.g = mask.g ? i8buffer_ld1(bottom_blob_data, v_offset.g) : pad_value.g; + v.b = mask.b ? i8buffer_ld1(bottom_blob_data, v_offset.b) : pad_value.b; + v.a = mask.a ? 
i8buffer_ld1(bottom_blob_data, v_offset.a) : pad_value.a; + + i8buffer_st4(top_blob_data, gi, v); + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y4 = clamp(y4, 0, psc(h) - 1); + + ivec4 v_offset = y4 * psc(w) + x; + i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + y4 = abs(y4); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y4 = (p.h - 1) - abs(y4 - (p.h - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y4 = (psc(h) - 1) - abs(y4 - (psc(h) - 1)); + + ivec4 v_offset = y4 * psc(w) + x; + i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); + } + } + else // if (psc(dims) == 3) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + ivec4 z4 = gz * 4 - p.front + ivec4(0, 1, 2, 3); + + if (type == 0) + { + bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(z4, ivec4(0))) & uvec4(lessThan(z4, ivec4(psc(c)))))); + + ivec4 pad_value4 = per_channel_pad == 1 ? i8buffer_ld4(per_channel_pad_blob_data, gz) : pad_value; + + ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x; + + ivec4 v; + v.r = mask.r ? i8buffer_ld1(bottom_blob_data, v_offset.r) : pad_value4.r; + v.g = mask.g ? i8buffer_ld1(bottom_blob_data, v_offset.g) : pad_value4.g; + v.b = mask.b ? i8buffer_ld1(bottom_blob_data, v_offset.b) : pad_value4.b; + v.a = mask.a ? 
i8buffer_ld1(bottom_blob_data, v_offset.a) : pad_value4.a; + + i8buffer_st4(top_blob_data, gi, v); + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + z4 = clamp(z4, 0, psc(c) - 1); + + ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x; + i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + z4 = abs(z4); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + z4 = (p.c - 1) - abs(z4 - (p.c - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + // z4 = (psc(c) - 1) - abs(z4 - (psc(c) - 1)); + + ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x; + i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); + } + } +} diff --git a/src/layer/vulkan/shader/padding_pack4_int8.comp b/src/layer/vulkan/shader/padding_pack4_int8.comp new file mode 100644 index 000000000000..a844f83ea3c7 --- /dev/null +++ b/src/layer/vulkan/shader/padding_pack4_int8.comp @@ -0,0 +1,246 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; 
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const ivec4 pad_value = ivec4(int(value)); + + if (psc(dims) == 1) + { + int x = gx - p.left / 4; + + if (type == 0) + { + if (x >= 0 && x < psc(w)) + { + i8buffer_cp4(top_blob_data, gx, bottom_blob_data, x); + } + else + { + i8buffer_st4(top_blob_data, gx, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + + ivec4 v = i8buffer_ld4(bottom_blob_data, x); + + if (gx < p.left / 4) + v = ivec4(v.r); + else if (gx >= psc(w) + p.left / 4) + v = ivec4(v.a); + + i8buffer_st4(top_blob_data, gx, v); + } + if (type == 2) + { + x = abs(x); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + + ivec4 v = i8buffer_ld4(bottom_blob_data, x); + + if (gx < p.left / 4) + { + ivec4 v0 = i8buffer_ld4(bottom_blob_data, x - 1); + v = ivec4(v.r, v0.a, v0.b, v0.g); + } + else if (gx >= psc(w) + p.left / 4) + { + ivec4 v1 = i8buffer_ld4(bottom_blob_data, x + 1); + v = ivec4(v1.b, v1.g, v1.r, v.a); + } + + i8buffer_st4(top_blob_data, gx, v); + } + } + else if (psc(dims) == 2) + { + const int gi = gy * psc(outw) 
+ gx; + + int x = gx - p.left; + int y = gy - p.top / 4; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { + int v_offset = y * psc(w) + x; + i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); + } + else + { + i8buffer_st4(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + + int v_offset = y * psc(w) + x; + + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + + if (gy < p.top / 4) + v = ivec4(v.r); + else if (gy >= psc(h) + p.top / 4) + v = ivec4(v.a); + + i8buffer_st4(top_blob_data, gi, v); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + + int v_offset = y * psc(w) + x; + + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + + if (gy < p.top / 4) + { + ivec4 v0 = i8buffer_ld4(bottom_blob_data, v_offset - psc(w)); + v = ivec4(v.r, v0.a, v0.b, v0.g); + } + else if (gy >= psc(h) + p.top / 4) + { + ivec4 v1 = i8buffer_ld4(bottom_blob_data, v_offset + psc(w)); + v = ivec4(v1.b, v1.g, v1.r, v.a); + } + + i8buffer_st4(top_blob_data, gi, v); + } + } + else // if (psc(dims) == 3) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + int z = gz - p.front / 4; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c)) + { + int v_offset = z * psc(cstep) + y * psc(w) + x; + i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); + } + else if (per_channel_pad == 1) + { + i8buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); + } + else + { + i8buffer_st4(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + z = 
clamp(z, 0, psc(c) - 1); + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + + if (gz < p.front / 4) + v = ivec4(v.r); + else if (gz >= psc(c) + p.front / 4) + v = ivec4(v.a); + + i8buffer_st4(top_blob_data, gi, v); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + z = abs(z); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + z = (p.c - 1) - abs(z - (p.c - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + // z = (psc(c) - 1) - abs(z - (psc(c) - 1)); + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + + if (gz < p.front / 4) + { + ivec4 v0 = i8buffer_ld4(bottom_blob_data, v_offset - psc(cstep)); + v = ivec4(v.r, v0.a, v0.b, v0.g); + } + else if (gz >= psc(c) + p.front / 4) + { + ivec4 v1 = i8buffer_ld4(bottom_blob_data, v_offset + psc(cstep)); + v = ivec4(v1.b, v1.g, v1.r, v.a); + } + + i8buffer_st4(top_blob_data, gi, v); + } + } +} diff --git a/src/layer/vulkan/shader/padding_pack4to1_int8.comp b/src/layer/vulkan/shader/padding_pack4to1_int8.comp new file mode 100644 index 000000000000..062d430d361a --- /dev/null +++ b/src/layer/vulkan/shader/padding_pack4to1_int8.comp @@ -0,0 +1,191 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int type = 1; +layout(constant_id = 1) const float value = 0; +layout(constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout(constant_id = shape_constant_id_offset + 1) const int w = 0; +layout(constant_id = shape_constant_id_offset + 2) const int h = 0; +layout(constant_id = shape_constant_id_offset + 3) const int c = 0; +layout(constant_id = 
shape_constant_id_offset + 4) const int cstep = 0; + +layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; + int front; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int pad_value = int(value); + + if (psc(dims) == 1) + { + int x = gx - p.left; + + if (type == 0) + { + if (x >= 0 && x < psc(w) * 4) + { + ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4); + i8buffer_st1(top_blob_data, gx, v[x % 4]); + } + else + { + i8buffer_st1(top_blob_data, gx, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) * 4 - 1); + + ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4); + i8buffer_st1(top_blob_data, gx, v[x % 4]); + } + if (type == 2) + { + x = abs(x); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w * 4 - 1) - abs(x - (p.w * 4 - 1)); + // x = (psc(w) * 4 - 1) - abs(x - (psc(w) * 4 - 1)); + + ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4); + i8buffer_st1(top_blob_data, gx, v[x % 4]); + } + } + else if (psc(dims) == 2) + { + const int gi = gy * psc(outw) + gx; + + int x = gx 
- p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) * 4) + { + int v_offset = (y / 4) * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[y % 4]); + } + else + { + i8buffer_st1(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) * 4 - 1); + + int v_offset = (y / 4) * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[y % 4]); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + // NOTE psc(X) get zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h * 4 - 1) - abs(y - (p.h * 4 - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) * 4 - 1) - abs(y - (psc(h) * 4 - 1)); + + int v_offset = (y / 4) * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[y % 4]); + } + } + else // if (psc(dims) == 3) + { + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + int z = gz - p.front; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c) * 4) + { + int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[z % 4]); + } + else if (per_channel_pad == 1) + { + i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); + } + else + { + i8buffer_st1(top_blob_data, gi, pad_value); + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + z = clamp(z, 0, psc(c) * 4 - 1); + + int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[z % 4]); + } + if (type == 2) + { + x = abs(x); + y = abs(y); + z = abs(z); + // NOTE psc(X) get 
zeros on nvidia + // TODO only enable this workaround for some nvidia driver + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + z = (p.c * 4 - 1) - abs(z - (p.c * 4 - 1)); + // x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + // y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + // z = (psc(c) * 4 - 1) - abs(z - (psc(c) * 4 - 1)); + + int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x; + ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset); + i8buffer_st1(top_blob_data, gi, v[z % 4]); + } + } +} diff --git a/src/layer/vulkan/shader/quantize.comp b/src/layer/vulkan/shader/quantize.comp index 65dc245f8917..39c30f02b81e 100644 --- a/src/layer/vulkan/shader/quantize.comp +++ b/src/layer/vulkan/shader/quantize.comp @@ -44,7 +44,8 @@ void main() scale = buffer_ld1(scale_blob_data, gy); } - int v_int = int(round(clamp(v * scale, afp(-127.f), afp(127.f)))); + v *= scale; + int v_int = float2int8(v); const uint outgi = gy * psc(out_stride) + gx; diff --git a/src/layer/vulkan/shader/quantize_pack4.comp b/src/layer/vulkan/shader/quantize_pack4.comp index 7ef6d3208547..7603e0758ed0 100644 --- a/src/layer/vulkan/shader/quantize_pack4.comp +++ b/src/layer/vulkan/shader/quantize_pack4.comp @@ -44,7 +44,8 @@ void main() scale = buffer_ld4(scale_blob_data, gy); } - ivec4 v_int = ivec4(round(clamp(v * scale, afp(-127.f), afp(127.f)))); + v *= scale; + ivec4 v_int = float2int8vec4(v); const uint outgi = gy * psc(out_stride) + gx; diff --git a/src/layer/vulkan/shader/requantize.comp b/src/layer/vulkan/shader/requantize.comp index d129e26a81b1..c2eebf00351d 100644 --- a/src/layer/vulkan/shader/requantize.comp +++ b/src/layer/vulkan/shader/requantize.comp @@ -84,7 +84,8 @@ void main() scale_out = buffer_ld1(scale_out_blob_data, gy); } - int v_int = int(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); + v_fp *= scale_out; + int v_int = float2int8(v_fp); const uint outgi = gy * psc(out_stride) + gx; diff --git a/src/layer/vulkan/shader/requantize_pack4.comp 
b/src/layer/vulkan/shader/requantize_pack4.comp index 3638b6414ac7..5855c510af39 100644 --- a/src/layer/vulkan/shader/requantize_pack4.comp +++ b/src/layer/vulkan/shader/requantize_pack4.comp @@ -84,7 +84,8 @@ void main() scale_out = buffer_ld4(scale_out_blob_data, gy); } - ivec4 v_int = ivec4(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); + v_fp *= scale_out; + ivec4 v_int = float2int8vec4(v_fp); const uint outgi = gy * psc(out_stride) + gx; diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 6ddabc7360bc..73a113aabdfd 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -523,6 +523,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + return -1; + } + #if NCNN_BF16 if (opt.use_bf16_storage && bottom_blob.elembits() == 16) { @@ -538,16 +545,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } #endif -#if NCNN_BF16 - if (opt.use_bf16_storage && bottom_blob.elembits() == 16) - { - return forward_bf16s(bottom_blob, top_blob, opt); - } -#endif - // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... 
-> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { @@ -589,6 +592,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option return 0; } +#if NCNN_BF16 + if (opt.use_bf16_storage && bottom_blob.elembits() == 16) + { + return forward_bf16s(bottom_blob, top_blob, opt); + } +#endif + int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -1366,6 +1376,9 @@ int Convolution_x86::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const // flattened blob, implement as InnerProduct if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) { + NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct"); + NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 
0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type); + Mat bottom_blob_3d; if (bottom_blob.elemsize % 16 == 0) { diff --git a/src/net.cpp b/src/net.cpp index 57e5a1241960..8c4d5d5fe80f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1101,6 +1101,7 @@ int Net::load_param(const DataReader& dr) // sanitize use options if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!d->vkdev->info.support_int16_storage() || !d->vkdev->info.support_int16_arithmetic()) opt.use_int16_storage = false; if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false; @@ -1279,12 +1280,6 @@ int Net::load_param(const DataReader& dr) return -1; } - if (layer->support_int8_storage) - { - // no int8 gpu support yet - opt.use_vulkan_compute = false; - } - Option opt1 = get_masked_option(opt, layer->featmask); if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) @@ -1480,6 +1475,7 @@ int Net::load_param_bin(const DataReader& dr) // sanitize use options if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!d->vkdev->info.support_int16_storage() || !d->vkdev->info.support_int16_arithmetic()) opt.use_int16_storage = false; if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false; @@ -1641,12 +1637,6 @@ int Net::load_param_bin(const DataReader& dr) return -1; } - if (layer->support_int8_storage) - { - // no int8 gpu 
support yet - opt.use_vulkan_compute = false; - } - Option opt1 = get_masked_option(opt, layer->featmask); if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) diff --git a/src/option.cpp b/src/option.cpp index 6e03ebc7f663..876b81cc349e 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -39,7 +39,7 @@ Option::Option() use_fp16_arithmetic = true; use_int8_packed = true; use_int8_storage = true; - use_int8_arithmetic = false; + use_int8_arithmetic = true; use_packing_layout = true; @@ -70,8 +70,8 @@ Option::Option() use_fp16_uniform = true; use_int8_uniform = true; - use_reserved_9 = false; - use_reserved_10 = false; + use_int16_packed = true; + use_int16_storage = true; use_reserved_11 = false; } diff --git a/src/option.h b/src/option.h index b65cb579ec7f..4be2175a7576 100644 --- a/src/option.h +++ b/src/option.h @@ -145,8 +145,9 @@ class NCNN_EXPORT Option bool use_fp16_uniform; bool use_int8_uniform; - bool use_reserved_9; - bool use_reserved_10; + // enable int16 layout options for vulkan int8 shader intermediate data + bool use_int16_packed; + bool use_int16_storage; bool use_reserved_11; }; diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp index ab9a121670b6..33f423c7957b 100644 --- a/src/pipelinecache.cpp +++ b/src/pipelinecache.cpp @@ -22,7 +22,7 @@ namespace ncnn { #if NCNN_VULKAN #define NCNN_PIPELINE_CACHE_FILE_MAGIC 0x5a545546 -#define NCNN_PIPELINE_CACHE_FILE_VERSION 1 +#define NCNN_PIPELINE_CACHE_FILE_VERSION 2 #define NCNN_PIPELINE_CACHE_FILE_ENDIAN 0x12345678 #define NCNN_PIPELINE_CACHE_NCNN_VERSION NCNN_VERSION_NUMBER @@ -216,7 +216,9 @@ static uint32_t encode_spirv_cache_opt_bits(const Option& opt) | (uint32_t)opt.use_int8_uniform << 9 | (uint32_t)opt.use_subgroup_ops << 10 | (uint32_t)opt.use_shader_local_memory << 11 - | (uint32_t)opt.use_cooperative_matrix << 12; + | (uint32_t)opt.use_cooperative_matrix << 12 + | (uint32_t)opt.use_int16_packed << 13 + | (uint32_t)opt.use_int16_storage << 14; } static bool 
can_cache_spirv(const VulkanDevice* vkdev, const Option& opt) diff --git a/tests/test_c_api.cpp b/tests/test_c_api.cpp index 643ed15e3d28..7c00801bc982 100644 --- a/tests/test_c_api.cpp +++ b/tests/test_c_api.cpp @@ -355,6 +355,8 @@ static int test_c_api_3() TEST_OPTION_SET_GET(use_int8_packed, 1, 0) TEST_OPTION_SET_GET(use_int8_storage, 1, 0) TEST_OPTION_SET_GET(use_int8_arithmetic, 1, 0) + TEST_OPTION_SET_GET(use_int16_packed, 1, 0) + TEST_OPTION_SET_GET(use_int16_storage, 1, 0) TEST_OPTION_SET_GET(use_bf16_packed, 1, 0) TEST_OPTION_SET_GET(use_bf16_storage, 1, 0) diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp index 02680fc1e893..8cdd986a711d 100644 --- a/tests/test_convolution_3.cpp +++ b/tests/test_convolution_3.cpp @@ -129,10 +129,14 @@ static int test_convolution_3() } #if NCNN_INT8 -static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false) +static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false, int int8_scale_term = 0, bool sgemm = false, bool input_int8 = false) { ncnn::Mat a = RandomMat(w, h, c); + if (int8_scale_term == 0) + int8_scale_term = requant ? 101 : 1; + const bool use_requant = int8_scale_term > 100; + ncnn::ParamDict pd; pd.set(0, outch); pd.set(1, kernel); @@ -141,7 +145,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int pd.set(4, pad); pd.set(5, bias); pd.set(6, outch * c * kernel * kernel); - pd.set(8, requant ? 
101 : 1); // int8_scale_term + pd.set(8, int8_scale_term); // int8_scale_term int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 ncnn::Mat activation_params(2); @@ -155,7 +159,16 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + ncnn::Mat top_scales = use_requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + + ncnn::Mat a_int8 = a; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + ncnn::quantize_to_int8(a, a_int8, input_scales, opt); + } if (kernel == 3 && dilation == 1 && stride == 1) { @@ -178,14 +191,36 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int weights[3] = top_scales; } - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("Convolution", pd, weights, a, requant ? 1.0f : 0.001f, flag); + int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0; + int ret = 0; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + opt.use_sgemm_convolution = sgemm; + opt.use_winograd_convolution = false; + + ret = test_layer_opt("Convolution", pd, weights, opt, a_int8, use_requant ? 1.0f : 0.001f, flag); + } + else + { + ret = test_layer("Convolution", pd, weights, a_int8, use_requant ? 
1.0f : 0.001f, flag); + } if (ret != 0) { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d sgemm=%d input_int8=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, sgemm, input_int8, activation_type, activation_params[0], activation_params[1]); return ret; } + if (input_int8) + return ret; + if (kernel == 3 && dilation == 1 && stride == 1) { ncnn::Option opt; @@ -201,10 +236,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int opt.use_winograd23_convolution = true; opt.use_winograd43_convolution = false; - ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag); + ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 
1.0f : 0.001f, flag); if (ret != 0) { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]); return ret; } } @@ -221,10 +256,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag); + ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag); if (ret != 0) { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]); return ret; } } @@ -241,10 +276,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag); + ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 
1.0f : 0.001f, flag); if (ret != 0) { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]); return ret; } } @@ -261,10 +296,31 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag); + ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]); + return ret; + } + } + + if (sgemm) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + opt.use_sgemm_convolution = true; + opt.use_winograd_convolution = false; + + ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 
1.0f : 0.001f, flag); if (ret != 0) { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]); return ret; } } @@ -356,7 +412,22 @@ static int test_convolution_1() || test_convolution_int8(3, 9, 16, 13, 2, 2, 1, 0, 0) || test_convolution_int8(33, 5, 15, 5, 2, 1, 3, 0, 1) || test_convolution_int8(23, 11, 33, 28, 5, 1, 1, 0, 1) - || test_convolution_int8(3, 63, 2, 28, 2, 1, 2, 0, 0); + || test_convolution_int8(3, 63, 2, 28, 2, 1, 2, 0, 0) + || test_convolution_int8(7, 5, 4, 8, 1, 1, 1, 0, 1, false, 2) + || test_convolution_int8(7, 5, 4, 8, 1, 1, 1, 0, 1, true, 102) + || test_convolution_int8(9, 7, 8, 12, 2, 1, 2, 1, 1, false, 1, true) + || test_convolution_int8(9, 7, 8, 12, 2, 1, 2, 1, 1, true, 101, true); +} + +static int test_convolution_1_int8_input() +{ + return 0 + || test_convolution_int8(7, 5, 1, 1, 3, 1, 1, 1, 1, false, 1, false, true) + || test_convolution_int8(7, 5, 4, 4, 3, 1, 1, 1, 1, false, 1, false, true) + || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, false, 1, false, true) + || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, false, 2, false, true) + || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, true, 102, false, true) + || test_convolution_int8(9, 7, 8, 8, 2, 1, 1, 1, 1, false, 1, true, true); } static int test_convolution_1_2() @@ -443,6 +514,7 @@ int main() #if NCNN_INT8 return 0 || test_convolution_1() + || test_convolution_1_int8_input() || test_convolution_1_2() || test_convolution_2() || test_convolution_3(); diff 
--git a/tests/test_convolutiondepthwise_1.cpp b/tests/test_convolutiondepthwise_1.cpp index ae408ba9849b..85c52dc38d30 100644 --- a/tests/test_convolutiondepthwise_1.cpp +++ b/tests/test_convolutiondepthwise_1.cpp @@ -84,10 +84,14 @@ static int test_convolutiondepthwise_2() } #if NCNN_INT8 -static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false) +static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false, int int8_scale_term = 0, bool input_int8 = false) { ncnn::Mat a = RandomMat(w, h, c); + if (int8_scale_term == 0) + int8_scale_term = requant ? 101 : 1; + const bool use_requant = int8_scale_term > 100; + ncnn::ParamDict pd; pd.set(0, outch); pd.set(1, kernel); @@ -97,7 +101,7 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke pd.set(5, bias); pd.set(6, outch / group * c / group * kernel * kernel * group); pd.set(7, group); - pd.set(8, requant ? 101 : 1); // int8_scale_term + pd.set(8, int8_scale_term); // int8_scale_term int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 ncnn::Mat activation_params(2); @@ -108,9 +112,23 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke std::vector weights(bias ? 5 : 4); weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); - ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); + ncnn::Mat weight_scales; + if (int8_scale_term == 2 || int8_scale_term == 102) + weight_scales = scales_mat(weights[0], 1, weights[0].w, weights[0].w); + else + weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? 
scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + ncnn::Mat top_scales = use_requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + + ncnn::Mat a_int8 = a; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + ncnn::quantize_to_int8(a, a_int8, input_scales, opt); + } + if (bias) { weights[1] = RandomMat(outch); @@ -125,11 +143,28 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke weights[3] = top_scales; } - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, flag); + int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0; + int ret = 0; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + + ret = test_layer_opt("ConvolutionDepthWise", pd, weights, opt, a_int8, use_requant ? 1.0f : 0.001f, flag); + } + else + { + ret = test_layer("ConvolutionDepthWise", pd, weights, a_int8, use_requant ? 
1.0f : 0.001f, flag); + } if (ret != 0) { - fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d int8_scale_term=%d input_int8=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, int8_scale_term, input_int8, activation_type, activation_params[0], activation_params[1]); } return ret; @@ -176,7 +211,9 @@ static int test_convolutiondepthwise_1() || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4) || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15) || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16); + || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8, false, 2) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 2, false, 2); if (ret != 0) return -1; @@ -202,7 +239,9 @@ static int test_convolutiondepthwise_1() || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true) || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true) || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true) - || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true); + || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true, 102) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 2, true, 102); if (ret != 0) return -1; @@ -210,6 +249,17 @@ static int test_convolutiondepthwise_1() return 0; } + +static int 
test_convolutiondepthwise_1_int8_input() +{ + return 0 + || test_convolutiondepthwise_int8(9, 7, 1, 1, 3, 1, 1, 1, 1, 1, false, 1, true) + || test_convolutiondepthwise_int8(9, 7, 4, 4, 3, 1, 1, 1, 1, 4, false, 1, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, false, 1, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, false, 2, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 8, true, 101, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, true, 102, true); +} #endif // NCNN_INT8 int main() @@ -217,7 +267,7 @@ int main() SRAND(7767517); #if NCNN_INT8 - return test_convolutiondepthwise_1() || test_convolutiondepthwise_2(); + return test_convolutiondepthwise_1() || test_convolutiondepthwise_1_int8_input() || test_convolutiondepthwise_2(); #else return test_convolutiondepthwise_2(); #endif diff --git a/tests/test_gemm_3.cpp b/tests/test_gemm_3.cpp index f0224ca4786b..9a7eeeb6c759 100644 --- a/tests/test_gemm_3.cpp +++ b/tests/test_gemm_3.cpp @@ -218,7 +218,16 @@ static int test_gemm_int8(int M, int N, int K, float alpha, int transA, int tran RandomizeB(a[a.size() - 1], 10.f); } - int ret = test_layer("Gemm", pd, weights, a); + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + + int ret = test_layer_opt("Gemm", pd, weights, opt, a); if (ret != 0) { fprintf(stderr, "test_gemm_int8 failed M=%d N=%d K=%d alpha=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d output_N1M=%d\n", M, N, K, alpha, transA, transB, output_elemtype, output_transpose, constantA, constantB, output_N1M); @@ -227,7 +236,7 @@ static int test_gemm_int8(int M, int N, int K, float alpha, int transA, int tran return ret; } -static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, 
float alpha, float beta, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int constantC) +static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float alpha, float beta, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int constantC, bool use_bf16 = false, int output_N1M = 0) { int broadcast_type_C = 0; if (C.dims == 1 && C.w == 1) @@ -274,6 +283,7 @@ static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float al pd.set(8, N); pd.set(9, K); pd.set(10, broadcast_type_C); + pd.set(11, output_N1M); // pd.set(12, 1); // output_elempack pd.set(13, output_elemtype); pd.set(14, output_transpose); @@ -289,20 +299,29 @@ static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float al std::vector a; if (!constantA) { - a.push_back(transA ? ncnn::Mat(M, K) : ncnn::Mat(K, M)); + a.push_back(transA ? (output_N1M ? ncnn::Mat(M, 1, K) : ncnn::Mat(M, K)) : (output_N1M ? ncnn::Mat(K, 1, M) : ncnn::Mat(K, M))); RandomizeA(a[a.size() - 1], transA, 10.f); } if (!constantB) { - a.push_back(transB ? ncnn::Mat(K, N) : ncnn::Mat(N, K)); + a.push_back(transB ? (output_N1M ? ncnn::Mat(K, 1, N) : ncnn::Mat(K, N)) : (output_N1M ? 
ncnn::Mat(N, 1, K) : ncnn::Mat(N, K))); RandomizeB(a[a.size() - 1], 10.f); } if (!constantC) a.push_back(C); - int ret = test_layer("Gemm", pd, weights, a); + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = use_bf16; + opt.use_bf16_storage = use_bf16; + + int ret = test_layer_opt("Gemm", pd, weights, opt, a); if (ret != 0) { - fprintf(stderr, "test_gemm_int8_bias failed M=%d N=%d K=%d C.dims=%d C=(%d %d %d) alpha=%f beta=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d constantC=%d\n", M, N, K, C.dims, C.w, C.h, C.c, alpha, beta, transA, transB, output_elemtype, output_transpose, constantA, constantB, constantC); + fprintf(stderr, "test_gemm_int8_bias failed M=%d N=%d K=%d C.dims=%d C=(%d %d %d) alpha=%f beta=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d constantC=%d use_bf16=%d output_N1M=%d\n", M, N, K, C.dims, C.w, C.h, C.c, alpha, beta, transA, transB, output_elemtype, output_transpose, constantA, constantB, constantC, use_bf16, output_N1M); } return ret; @@ -366,6 +385,64 @@ static int test_gemm_int8_fp16s(int M, int N, int K, float alpha, int transA, in return 0; } +static int test_gemm_int8_bf16s(int M, int N, int K, float alpha, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int output_N1M) +{ + ncnn::ParamDict pd; + pd.set(0, alpha); + pd.set(1, 1.f); // beta + pd.set(2, transA); + pd.set(3, transB); + pd.set(4, constantA); + pd.set(5, constantB); + pd.set(6, 1); + pd.set(7, M); + pd.set(8, N); + pd.set(9, K); + pd.set(10, -1); + pd.set(11, output_N1M); + pd.set(13, output_elemtype); + pd.set(14, output_transpose); + pd.set(18, 2); // int8_scale_term + + std::vector weights; + if (constantA) weights.push_back(transA ? 
RandomS8Mat(M, K) : RandomS8Mat(K, M)); + if (constantB) weights.push_back(transB ? RandomS8Mat(K, N) : RandomS8Mat(N, K)); + if (constantA) weights.push_back(RandomMat(M, 10.f, 20.f)); + if (constantB) weights.push_back(RandomMat(1, 10.f, 20.f)); + + std::vector a; + if (!constantA) + { + a.push_back(transA ? (output_N1M ? ncnn::Mat(M, 1, K) : ncnn::Mat(M, K)) : (output_N1M ? ncnn::Mat(K, 1, M) : ncnn::Mat(K, M))); + RandomizeA(a[a.size() - 1], transA, 10.f); + } + if (!constantB) + { + a.push_back(transB ? (output_N1M ? ncnn::Mat(K, 1, N) : ncnn::Mat(K, N)) : (output_N1M ? ncnn::Mat(N, 1, K) : ncnn::Mat(N, K))); + RandomizeB(a[a.size() - 1], 10.f); + } + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = true; + opt.use_bf16_storage = true; + + float epsilon = 0.001; + + int ret = test_layer_opt("Gemm", pd, weights, opt, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_gemm_int8_bf16s failed M=%d N=%d K=%d alpha=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d output_N1M=%d\n", M, N, K, alpha, transA, transB, output_elemtype, output_transpose, constantA, constantB, output_N1M); + return ret; + } + + return 0; +} + static int test_gemm_0(int M, int N, int K) { return 0 @@ -422,6 +499,9 @@ static int test_gemm_1(int M, int N, int K) || test_gemm_int8_bias(M, N, K, RandomMat(N), 0.8f, 1.f, 0, 0, 1, 1, 0, 0, 0) || test_gemm_int8_bias(M, N, K, RandomMat(N), 3.1f, -0.6f, 0, 1, 2, 0, 0, 0, 0) || test_gemm_int8_bias(M, N, K, RandomMat(N), 3.1f, -0.6f, 0, 1, 3, 1, 0, 0, 0) + || test_gemm_int8_bias(M, N, K, RandomMat(1), 1.7f, -0.4f, 0, 0, 0, 0, 0, 0, 0, false, 1) + || test_gemm_int8_bias(M, N, K, RandomMat(M), -1.3f, 0.6f, 1, 0, 0, 1, 0, 0, 0, false, 1) + || test_gemm_int8_bias(M, N, K, RandomMat(N, M), 0.8f, 0.5f, 0, 1, 0, 0, 0, 0, 0, false, 1) || test_gemm_int8_bias(M, N, K, 
RandomMat(1), -2.1f, 0.5f, 0, 0, 0, 0, 1, 1, 1) || test_gemm_int8_bias(M, N, K, RandomMat(1), -2.1f, 0.5f, 0, 0, 1, 1, 1, 1, 1) @@ -504,6 +584,13 @@ int main() return ret; } } + + if (test_gemm_int8_bf16s(12, 23, 12, 1.f, 0, 1, 0, 0, 0, 0, 0) + || test_gemm_int8_bf16s(12, 23, 12, 1.f, 1, 0, 0, 1, 0, 0, 0) + || test_gemm_int8_bias(12, 23, 12, RandomMat(23), 1.7f, -0.4f, 0, 1, 0, 1, 0, 0, 0, true)) + { + return -1; + } #else // test nothing for non-int8 build #endif diff --git a/tests/test_gemm_4.cpp b/tests/test_gemm_4.cpp index 911a03bbd6f2..8d350e671170 100644 --- a/tests/test_gemm_4.cpp +++ b/tests/test_gemm_4.cpp @@ -204,7 +204,16 @@ static int test_gemm_int8(int M, int N, int K, int TILE_M, int TILE_N, int TILE_ RandomizeA(a[0], transA, 10.f); RandomizeB(a[1], 10.f); - int ret = test_layer("Gemm", pd, weights, a); + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + + int ret = test_layer_opt("Gemm", pd, weights, opt, a); if (ret != 0) { fprintf(stderr, "test_gemm_int8 failed M=%d N=%d K=%d TILE_M=%d TILE_N=%d TILE_K=%d alpha=%f transA=%d transB=%d output_transpose=%d\n", M, N, K, TILE_M, TILE_N, TILE_K, alpha, transA, transB, output_transpose); diff --git a/tests/test_innerproduct.cpp b/tests/test_innerproduct.cpp index 921225308d88..66a2fbddda06 100644 --- a/tests/test_innerproduct.cpp +++ b/tests/test_innerproduct.cpp @@ -86,7 +86,7 @@ static int test_innerproduct_3() } #if NCNN_INT8 -static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) +static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias, bool input_int8 = false, bool weight_int8 = false) { ncnn::ParamDict pd; pd.set(0, outch); // num_output @@ -103,10 +103,19 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) std::vector weights(bias ? 
4 : 3); const int k = a.w * a.h * a.d * a.c; - weights[0] = RandomMat(outch * k); - ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k); + weights[0] = weight_int8 ? RandomS8Mat(outch * k) : RandomMat(outch * k); + ncnn::Mat weight_scales = weight_int8 ? RandomMat(outch, 10.f, 20.f) : scales_mat(weights[0], outch, k, k); ncnn::Mat input_scales = scales_mat(a, 1, k, k); + ncnn::Mat a_int8 = a; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + ncnn::quantize_to_int8(a, a_int8, input_scales, opt); + } + if (bias) { weights[1] = RandomMat(outch); @@ -119,11 +128,28 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) weights[2] = input_scales; } - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, flag); + int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0; + int ret = 0; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + + ret = test_layer_opt("InnerProduct", pd, weights, opt, a_int8, 0.001f, flag); + } + else + { + ret = test_layer("InnerProduct", pd, weights, a_int8, 0.001f, flag); + } if (ret != 0) { - fprintf(stderr, "test_innerproduct_int8 failed a.dims=%d a=(%d %d %d %d) outch=%d bias=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.d, a.c, outch, bias, activation_type, activation_params[0], activation_params[1]); + fprintf(stderr, "test_innerproduct_int8 failed a.dims=%d a=(%d %d %d %d) outch=%d bias=%d input_int8=%d weight_int8=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.d, a.c, outch, bias, input_int8, weight_int8, activation_type, activation_params[0], activation_params[1]); } return ret; @@ -142,7 +168,14 @@ static int test_innerproduct_4() || test_innerproduct_int8(RandomMat(6, 2, 8), 
8, 1) || test_innerproduct_int8(RandomMat(8, 3, 15), 15, 1) || test_innerproduct_int8(RandomMat(7, 2, 16), 4, 1) - || test_innerproduct_int8(RandomMat(6, 3, 16), 16, 1); + || test_innerproduct_int8(RandomMat(6, 3, 16), 16, 1) + || test_innerproduct_int8(RandomMat(16), 16, 1, true) + || test_innerproduct_int8(RandomMat(32), 16, 1, false, true) + || test_innerproduct_int8(RandomMat(16), 12, 1, true, true) + || test_innerproduct_int8(RandomMat(2, 2, 1), 7, 1, true) + || test_innerproduct_int8(RandomMat(2, 2, 2), 7, 1, true) + || test_innerproduct_int8(RandomMat(2, 2, 3), 7, 1, true) + || test_innerproduct_int8(RandomMat(2, 2, 4), 8, 1, true); } #endif // NCNN_INT8 @@ -205,7 +238,7 @@ static int test_innerproduct_5() } #if NCNN_INT8 -static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias) +static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias, bool input_int8 = false, bool weight_int8 = false) { ncnn::ParamDict pd; pd.set(0, outch); @@ -215,10 +248,19 @@ static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias) std::vector weights(bias ? 4 : 3); const int k = a.w; - weights[0] = RandomMat(outch * k); - ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k); + weights[0] = weight_int8 ? RandomS8Mat(outch * k) : RandomMat(outch * k); + ncnn::Mat weight_scales = weight_int8 ? RandomMat(outch, 10.f, 20.f) : scales_mat(weights[0], outch, k, k); ncnn::Mat input_scales = scales_mat(a, 1, k, k); + ncnn::Mat a_int8 = a; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = false; + ncnn::quantize_to_int8(a, a_int8, input_scales, opt); + } + if (bias) { weights[1] = RandomMat(outch); @@ -231,11 +273,28 @@ static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias) weights[2] = input_scales; } - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, flag); + int flag = input_int8 ? 
TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0; + int ret = 0; + if (input_int8) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_packed = false; + opt.use_bf16_storage = false; + + ret = test_layer_opt("InnerProduct", pd, weights, opt, a_int8, 0.001f, flag); + } + else + { + ret = test_layer("InnerProduct", pd, weights, a_int8, 0.001f, flag); + } if (ret != 0) { - fprintf(stderr, "test_innerproduct_gemm_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d\n", a.dims, a.w, a.h, a.c, outch, bias); + fprintf(stderr, "test_innerproduct_gemm_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d input_int8=%d weight_int8=%d\n", a.dims, a.w, a.h, a.c, outch, bias, input_int8, weight_int8); } return ret; @@ -252,7 +311,11 @@ static int test_innerproduct_6() || test_innerproduct_gemm_int8(RandomMat(16, 12), 16, 0) || test_innerproduct_gemm_int8(RandomMat(4, 15), 8, 1) || test_innerproduct_gemm_int8(RandomMat(6, 16), 16, 0) - || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1); + || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1) + || test_innerproduct_gemm_int8(RandomMat(11, 16), 8, 1, false, true) + || test_innerproduct_gemm_int8(RandomMat(13, 15), 7, 1, true) + || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1, true) + || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1, true, true); } static int test_innerproduct_7() diff --git a/tests/test_padding.cpp b/tests/test_padding.cpp index 6d7c07a174cc..4e4190d5780a 100644 --- a/tests/test_padding.cpp +++ b/tests/test_padding.cpp @@ -237,7 +237,7 @@ static int test_padding_int8(const ncnn::Mat& a, int top, int bottom, int left, if (per_channel_pad_data_size) weights[0] = RandomMat(per_channel_pad_data_size); - int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_DISABLE_GPU_TESTING; + int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING; int ret = test_layer("Padding", 
pd, weights, a, 0.001, flag); if (ret != 0) {