見出し画像

シストリックアレイを作ってみよう【完成編】

背景

Systolic array、RISC-V、開発環境がそろったので、
組み上げて完成させよう。

全体図

このような構成です。

コード

作成開始
https://github.com/rmbmp717/SystolicArray/

ディレクトリ構成

.
├── Python_model
├── RISCV
│   ├── RV32IM_FPGA_PIPELINE.v
│   └── RV32IM_FPGA_PIPELINE_SUP.v
├── Verilog
│   ├── SA4x4
│   │   ├── SystolicArray4x4_top.v
│   │   └── SystolicArray4x4.v
│   └── SA8x8
│       ├── SystolicArray8x8_top.v
│       └── SystolicArray8x8.v
├── c_program
│   ├── SA1.c
│   └── SA2.c
└── FPGA

先々週にcocotbでの制御信号コードは作成済みです。あとは、その制御信号をRISC-Vから制御すればよいだけです。
トップモジュールは下。
make実行で

  1. Cプログラムをコンパイル

  2. hexコードに変換

  3. RiscVプロセッサ起動

  4. SAにデータセット

  5. シフト動作

  6. データをRiscVに戻す。

を実行していきます。
下がトップモジュールのコードです。uartにシフトパルス制御モジュールとA Matrixデータ入力モジュール、B Matrixデータ入力モジュールが接続されてます。

`timescale 1ns / 1ps

// -----------------------------------------------------------------------------
// SystolicArray4x4_top
//
// Top-level wrapper that instantiates, side by side:
//   * the 4x4 systolic array core (SystolicArray4x4),
//   * an RV32IM processor whose `uart_out` bus is split here into a 1-bit
//     `uart_rw` flag and an 8-bit `uart_data` value,
//   * three `shift_module` instances (EN_SHIFT_ADDR = 8'hFF / 8'hFE / 8'hFD)
//     wired to en_shift_right / en_b_shift_bottom / en_shift_bottom,
//   * three `data_16x4_module` instances (DATA_WRITE_ADDR = 8'hFC / 8'hFB /
//     8'hFA) wired to a_left_in_flat / b_top_in_flat / ps_top_in_flat.
//
// The four 16-bit bottom outputs of the array are packed into two 32-bit words
// and presented to the processor on DMA_in0 / DMA_in1.
//
// The unpacked-array ports (`[0:3]`) are a SystemVerilog construct, so the
// simulator must be run with SystemVerilog support enabled.
//
// NOTE(review): en_shift_right, en_b_shift_bottom, en_shift_bottom,
// a_left_in_flat, b_top_in_flat and ps_top_in_flat are declared as module
// *inputs*, yet they are also connected to the `shift` / `saved_data*` ports
// of the helper modules, which look like outputs. If so, nothing outside this
// module may drive them, otherwise the nets get two drivers. The port list is
// left unchanged to keep the interface stable -- confirm the helper modules'
// port directions.
// -----------------------------------------------------------------------------
module SystolicArray4x4_top (
    // Ports from the upper module
    input  wire         Clock,
    input  wire         rst_n,
    input  wire         data_clear,
    input  wire         en_b_shift_bottom,
    input  wire         en_shift_right,
    input  wire         en_shift_bottom,

    input  wire [15:0]  a_left_in_flat   [0:3],
    input  wire [15:0]  b_top_in_flat    [0:3],
    input  wire [15:0]  ps_top_in_flat   [0:3],

    output wire [15:0]  ps_bottom_out_flat [0:3]
);

    // Processor-side bus shared by every helper module below.
    wire       uart_rw;
    wire [7:0] uart_data;

    // =================================================================
    // 1. Instantiate the submodule (SystolicArray4x4)
    // =================================================================
    SystolicArray4x4 u_systolic (
        .Clock              (Clock),
        .rst_n              (rst_n),
        .data_clear         (data_clear),
        .en_b_shift_bottom  (en_b_shift_bottom),
        .en_shift_right     (en_shift_right),
        .en_shift_bottom    (en_shift_bottom),

        .a_left_in_flat     (a_left_in_flat),
        .b_top_in_flat      (b_top_in_flat),
        .ps_top_in_flat     (ps_top_in_flat),

        .ps_bottom_out_flat (ps_bottom_out_flat)
    );

    // Scalar aliases of the array's bottom outputs. The names are kept as-is
    // because they are the signal names that show up in the waveform viewer.
    wire [15:0] bm_data0;
    wire [15:0] bm_data1;
    wire [15:0] bm_data2;
    wire [15:0] bm_data3;
    assign  bm_data0 = ps_bottom_out_flat[0];
    assign  bm_data1 = ps_bottom_out_flat[1];
    assign  bm_data2 = ps_bottom_out_flat[2];
    assign  bm_data3 = ps_bottom_out_flat[3];

    // Pack two 16-bit results per 32-bit word; the lower-numbered column
    // occupies bits [15:0].
    wire [31:0] dma_in_data0;
    wire [31:0] dma_in_data1;
    assign  dma_in_data0 = {bm_data1, bm_data0};
    assign  dma_in_data1 = {bm_data3, bm_data2};

    // RISC-V processor instance
    RV32IM uRV32IM(
        // Clock & Reset
        .clock              (Clock),
        .reset_n            (rst_n),
        // 9-bit connection: MSB -> uart_rw, low 8 bits -> uart_data
        // (assumes RV32IM.uart_out is 9 bits wide -- TODO confirm)
        .uart_out           ({uart_rw, uart_data}),
        .DMA_in0            (dma_in_data0),
        .DMA_in1            (dma_in_data1)
    );

    // Shift enable for right shift
    shift_module #(
        .EN_SHIFT_ADDR      (8'hFF)
        ) right_shift_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .shift              (en_shift_right)
    );

    // Shift enable for b data shift
    shift_module #(
        .EN_SHIFT_ADDR      (8'hFE)
        ) b_shift_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .shift              (en_b_shift_bottom)
    );

    // Shift enable for bottom shift
    shift_module #(
        .EN_SHIFT_ADDR      (8'hFD)
        ) bottom_shift_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .shift              (en_shift_bottom)
    );

    // Data input module for matrix A
    data_16x4_module #(
        .DATA_WRITE_ADDR    (8'hFC)
    ) a_data_16x4_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .saved_data0        (a_left_in_flat[0]),
        .saved_data1        (a_left_in_flat[1]),
        .saved_data2        (a_left_in_flat[2]),
        .saved_data3        (a_left_in_flat[3])
    );

    // Data input module for matrix B
    data_16x4_module #(
        .DATA_WRITE_ADDR    (8'hFB)
    ) b_data_16x4_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .saved_data0        (b_top_in_flat[0]),
        .saved_data1        (b_top_in_flat[1]),
        .saved_data2        (b_top_in_flat[2]),
        .saved_data3        (b_top_in_flat[3])
    );

    // Data input module for PS in
    data_16x4_module #(
        .DATA_WRITE_ADDR    (8'hFA)
    ) ps_in_module(
        // Clock & Reset
        .Clock              (Clock),
        .rst_n              (rst_n),
        // UART interface
        .uart_rw            (uart_rw),
        .uart_in            (uart_data),
        .saved_data0        (ps_top_in_flat[0]),
        .saved_data1        (ps_top_in_flat[1]),
        .saved_data2        (ps_top_in_flat[2]),
        .saved_data3        (ps_top_in_flat[3])
    );


    // =================================================================
    // 2. VCD dump settings (for Icarus Verilog simulation)
    // =================================================================
    // Debugging signals: scalar copies of each unpacked-array element, so
    // that every element is available as an ordinary named net in the dump.
    wire [15:0] a_left_in_flat_0;
    wire [15:0] a_left_in_flat_1;
    wire [15:0] a_left_in_flat_2;
    wire [15:0] a_left_in_flat_3;
    assign a_left_in_flat_0 = a_left_in_flat[0];
    assign a_left_in_flat_1 = a_left_in_flat[1];
    assign a_left_in_flat_2 = a_left_in_flat[2];
    assign a_left_in_flat_3 = a_left_in_flat[3];

    wire [15:0] b_top_in_flat_0;
    wire [15:0] b_top_in_flat_1;
    wire [15:0] b_top_in_flat_2;
    wire [15:0] b_top_in_flat_3;
    assign b_top_in_flat_0 = b_top_in_flat[0];
    assign b_top_in_flat_1 = b_top_in_flat[1];
    assign b_top_in_flat_2 = b_top_in_flat[2];
    assign b_top_in_flat_3 = b_top_in_flat[3];

    wire [15:0] ps_bottom_out_flat_0;
    wire [15:0] ps_bottom_out_flat_1;
    wire [15:0] ps_bottom_out_flat_2;
    wire [15:0] ps_bottom_out_flat_3;
    assign ps_bottom_out_flat_0 = ps_bottom_out_flat[0];
    assign ps_bottom_out_flat_1 = ps_bottom_out_flat[1];
    assign ps_bottom_out_flat_2 = ps_bottom_out_flat[2];
    assign ps_bottom_out_flat_3 = ps_bottom_out_flat[3];

    wire [15:0] ps_top_in_flat_0;
    wire [15:0] ps_top_in_flat_1;
    wire [15:0] ps_top_in_flat_2;
    wire [15:0] ps_top_in_flat_3;
    assign ps_top_in_flat_0 = ps_top_in_flat[0];
    assign ps_top_in_flat_1 = ps_top_in_flat[1];
    assign ps_top_in_flat_2 = ps_top_in_flat[2];
    assign ps_top_in_flat_3 = ps_top_in_flat[3];

    // Depth 1 dumps only the signals declared directly in each listed scope,
    // so every instance of interest has to be named explicitly.
    initial begin
        $dumpfile("sa4x4.vcd");       // Output file name for VCD dump
        $dumpvars(1, SystolicArray4x4_top);
        $dumpvars(1, SystolicArray4x4_top.u_systolic);
        $dumpvars(1, SystolicArray4x4_top.uRV32IM);
        $dumpvars(1, SystolicArray4x4_top.right_shift_module);
        $dumpvars(1, SystolicArray4x4_top.b_shift_module);
        $dumpvars(1, SystolicArray4x4_top.bottom_shift_module);
        $dumpvars(1, SystolicArray4x4_top.a_data_16x4_module);
        $dumpvars(1, SystolicArray4x4_top.b_data_16x4_module);
        // Previously missing: the PS input writer was the only helper
        // instance left out of the waveform.
        $dumpvars(1, SystolicArray4x4_top.ps_in_module);
    end

endmodule

ほぼ動作確認できました。

動作結果

  1. シフト動作確認済み

  2. データ入力動作確認済み

  3. RISCVへデータが戻ってることを確認済み

     0.00ns INFO     cocotb                             Seeding Python random module with 1738856207
     0.00ns INFO     cocotb.regression                  pytest not found, install it to enable better AssertionError messages
     0.00ns INFO     cocotb.regression                  Found test test_SA_riscv.test_systolic_array
     0.00ns INFO     cocotb.regression                  running test_systolic_array (1/1)
                                                          Simulation test using cocotb.
                                                          Compares the Python model's output with the Verilog (hardware) output.

=== Current PE State (Matrix Format) ===
>>> a_reg:
[
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
]

>>> b_reg:
[
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
]

VCD info: dumpfile sa4x4.vcd opened for output.

=== Current PE State (Matrix Format) ===
>>> a_reg:
[
 [1 1 1 1]
 [1 2 1 1]
 [1 1 5 1]
 [3 1 1 4]
]

>>> b_reg:
[
 [1 3 4 1]
 [8 5 2 3]
 [2 2 5 1]
 [5 1 2 7]
]

=============== Output data =======================
 50090.00ns INFO     cocotb                             === Memory Dump [0x0000 .. 0x0320] ===
 50090.00ns INFO     cocotb                             0x0000: 13 01 01 FD 93 07 C0 1A 83 A5 07 00 03 A6 47 00 
 50090.00ns INFO     cocotb                             0x0010: 83 A6 87 00 03 A7 C7 00 23 20 B1 02 23 22 C1 02 
 50090.00ns INFO     cocotb                             0x0020: 23 24 D1 02 23 26 E1 02 83 A5 07 01 03 A6 47 01 
 50090.00ns INFO     cocotb                             0x0030: 83 A6 87 01 03 A7 C7 01 23 28 B1 00 23 2A C1 00 
 50090.00ns INFO     cocotb                             0x0040: 23 2C D1 00 23 2E E1 00 03 A6 07 02 83 A6 47 02 
 50090.00ns INFO     cocotb                             0x0050: 03 A7 87 02 83 A7 C7 02 23 20 C1 00 23 22 D1 00 
 50090.00ns INFO     cocotb                             0x0060: 23 24 E1 00 23 26 F1 00 93 07 F0 0F 23 28 F0 4E 
 50090.00ns INFO     cocotb                             0x0070: 23 28 00 24 93 07 E0 0F 23 28 F0 4E 23 28 00 24 
 50090.00ns INFO     cocotb                             0x0080: 13 06 00 00 6F 00 00 05 93 06 30 00 B3 86 C6 40 
 50090.00ns INFO     cocotb                             0x0090: 93 17 27 00 93 87 07 03 B3 87 27 00 B3 87 D7 00 
 50090.00ns INFO     cocotb                             0x00A0: 83 C7 07 FD 93 F7 F7 0F 23 28 F0 4E 23 28 00 24 
 50090.00ns INFO     cocotb                             0x00B0: 23 28 00 4E 23 28 00 24 13 07 17 00 93 07 30 00 
 50090.00ns INFO     cocotb                             0x00C0: E3 D4 E7 FC 93 07 F0 0F 23 28 F0 4E 23 28 00 24 
 50090.00ns INFO     cocotb                             0x00D0: 13 06 16 00 93 07 30 00 63 CC C7 00 93 07 C0 0F 
 50090.00ns INFO     cocotb                             0x00E0: 23 28 F0 4E 23 28 00 24 13 07 00 00 6F F0 1F FD 
 50090.00ns INFO     cocotb                             0x00F0: 93 06 00 00 6F 00 00 05 93 07 30 00 B3 87 D7 40 
 50090.00ns INFO     cocotb                             0x0100: 93 97 27 00 93 87 07 03 B3 87 27 00 B3 87 E7 00 
 50090.00ns INFO     cocotb                             0x0110: 83 C7 07 FE 93 F7 F7 0F 23 28 F0 4E 23 28 00 24 
 50090.00ns INFO     cocotb                             0x0120: 23 28 00 4E 23 28 00 24 13 07 17 00 93 07 30 00 
 50090.00ns INFO     cocotb                             0x0130: E3 D4 E7 FC 93 07 E0 0F 23 28 F0 4E 23 28 00 24 
 50090.00ns INFO     cocotb                             0x0140: 93 86 16 00 93 07 30 00 63 CC D7 00 93 07 B0 0F 
 50090.00ns INFO     cocotb                             0x0150: 23 28 F0 4E 23 28 00 24 13 07 00 00 6F F0 1F FD 
 50090.00ns INFO     cocotb                             0x0160: 93 07 00 00 6F 00 40 01 13 07 D0 0F 23 28 E0 4E 
 50090.00ns INFO     cocotb                             0x0170: 23 28 00 24 93 87 17 00 13 07 30 00 E3 56 F7 FE 
 50090.00ns INFO     cocotb                             0x0180: 93 07 00 00 6F 00 C0 00 23 28 00 24 93 87 17 00 
 50090.00ns INFO     cocotb                             0x0190: 13 07 30 01 E3 5A F7 FE 03 27 00 40 83 27 40 40 
 50090.00ns INFO     cocotb                             0x01A0: 23 20 E0 30 23 28 F0 30 6F 00 00 00 01 01 01 01 
 50090.00ns INFO     cocotb                             0x01B0: 01 01 01 01 01 01 01 01 01 01 01 01 01 03 04 01 
 50090.00ns INFO     cocotb                             0x01C0: 08 05 02 03 02 02 05 01 05 01 02 07 01 01 01 01 
 50090.00ns INFO     cocotb                             0x01D0: 01 02 01 01 01 01 05 01 03 01 01 04 00 00 00 00 
 50090.00ns INFO     cocotb                             0x01E0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x01F0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0220: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0230: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0240: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0250: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0260: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0270: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0280: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0290: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02A0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02B0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02C0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02D0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02E0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x02F0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0300: 1A 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0310: 21 00 21 00 00 00 00 00 00 00 00 00 00 00 00 00 
 50090.00ns INFO     cocotb                             0x0320: 00                                              
 50090.00ns INFO     cocotb                             ============================================
 50090.00ns INFO     cocotb.regression                  test_systolic_array passed
 50090.00ns INFO     cocotb.regression                  *******************************************************************************************
                                                        ** TEST                               STATUS  SIM TIME (ns)  REAL TIME (s)  RATIO (ns/s) **
                                                        *******************************************************************************************
                                                        ** test_SA_riscv.test_systolic_array   PASS       50090.00           1.15      43417.10  **
                                                        *******************************************************************************************
                                                        ** TESTS=1 PASS=1 FAIL=0 SKIP=0                   50090.00           1.23      40737.97  **
                                                        *******************************************************************************************
                                                        

プレテスト:8bitの場合の検証
Matrix A
[
[1 1 1 1]
[1 1 1 1]
[1 1 1 1]
[1 1 1 1]
]
Matrix B
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 5 5 7]
]

1列目 = 1+8+2+5=16 = 0x10
2列目 = 3+5+2+5=15 = 0x0F
3列目 = 4+2+5+5=16 = 0x10
4列目 = 1+3+1+7=12 = 0x0C

メモリダンプは
0x0300: 10 0F 10 0C 00 00 00 00 00 00 00 00 00 00 00 00

16bitの場合の検証
Matrix A
[
[1 1 1 1]
[1 4 1 1]
[1 1 1 0]
[2 1 1 1]
]
Matrix B
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 1 2 7]
]

1列目 = 1+8+2+5*2= 21= 0x15
2列目 = 3+5*4+2+1= 26= 0x1A
3列目 = 4+2+5+2= 13= 0x0D
4列目 = 1+3+0+7= 11= 0x0B

メモリダンプは
0x0300: 15 00 1A 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0310: 0D 00 0B 00 00 00 00 00 00 00 00 00 00 00 00 00

ですから、最終目的のシストリックアレイではまだありませんが、動作はOKのようです。基本は完成です。あとはRISCVのソフトウェア制御ですので、Cプログラムを追記していけば、どうとでもなるでしょう。現状、ベアメタルのプログラムですので、C言語の関数などは使用できないみたいです。それでもアセンブラで記述するより、かなり楽ですが。

最終結果

シストリックアレイの計算を実行する

>>> a_reg:
[
[1 1 1 1]
[1 4 1 1]
[1 1 1 0]
[2 1 1 1]
]

>>> b_reg:
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 1 2 7]
]

A x B の行列積は、
>>> a x b matrix:
[
[16 11 13 12]
[40 26 19 21]
[11 10 11 5]
[17 14 17 13]
]

Gtkwaveで確認すると、

bm_data0,1,2,3は
16
40,11
11,26,13
17,10,19,12
・・・合ってますね。
16進数では
0x010
0x028, 0x00B
0x00B, 0x01A, 0x00D
0x011, 0x00A, 0x013, 0x00C
→メモリダンプは
0x0300: 10 00 00 00 00 00 00 00 28 00 0B 00 00 00 00 00
0x0310: 0B 00 1A 00 0D 00 00 00 11 00 0A 00 13 00 0C 00
合ってますね!
あとは、シストリックアレイのダミーシフトを入れれば終了です。
(続く)


所感

CPU制御だとクロックをかなり消費してしまい、遅いです。データの転送部分はuartではなく、DMA転送にすべきです。
これがTPUと言われるAI計算用の専用半導体の中身だそうです。本物のシリコンチップであれば縦横数百のPEを並べて並列計算するのも余裕でしょう。

いいなと思ったら応援しよう!