
シストリックアレイを作ってみよう【完成編】
背景
Systolic array、RISC-V、開発環境
組み上げて完成させよう。
全体図
このような構成です。

コード
作成開始
https://github.com/rmbmp717/SystolicArray/
ディレクトリ構成
.
├── Python_model
├── RISCV
│   ├── RV32IM_FPGA_PIPELINE.v
│   └── RV32IM_FPGA_PIPELINE_SUP.v
├── Verilog
│   ├── SA4x4
│   │   ├── SystolicArray4x4_top.v
│   │   └── SystolicArray4x4.v
│   └── SA8x8
│       ├── SystolicArray8x8_top.v
│       └── SystolicArray8x8.v
├── c_program
│   ├── SA1.c
│   └── SA2.c
└── FPGA
先々週の時点で、cocotbによる制御信号のコードは作成済みです。あとは、その制御信号をRISCVから制御すればよい、ということになります。
トップモジュールは下。
make実行で
Cプログラムをコンパイル
hexコードに変換
RiscVプロセッサ起動
SAにデータセット
シフト動作
データをRiscVに戻す。
を実行していきます。
下がトップモジュールのコードです。uartに、シフトパルス制御モジュール、A Matrixデータ入力モジュール、B Matrixデータ入力モジュール、PS入力データモジュールが接続されています。
`timescale 1ns / 1ps

// -----------------------------------------------------------------------------
// SystolicArray4x4_top
//
// Top level that ties together:
//   * the 4x4 systolic array core (SystolicArray4x4),
//   * an RV32IM processor instance whose `uart_out` bundle ({uart_rw, uart_data})
//     is fanned out to every control/data module below,
//   * three shift_module instances (EN_SHIFT_ADDR = 8'hFF / 8'hFE / 8'hFD)
//     connected to en_shift_right / en_b_shift_bottom / en_shift_bottom,
//   * three data_16x4_module instances (DATA_WRITE_ADDR = 8'hFC / 8'hFB / 8'hFA)
//     connected to the A, B and PS input vectors of the array.
// The bottom outputs of the array are packed into two 32-bit words and fed back
// to the processor through DMA_in0 / DMA_in1.
//
// Notes:
//   * Unpacked array ports (`[15:0] x [0:3]`) are a SystemVerilog feature, so
//     this file must be compiled in SystemVerilog mode.
//   * NOTE(review): en_b_shift_bottom, en_shift_right, en_shift_bottom,
//     a_left_in_flat, b_top_in_flat and ps_top_in_flat are declared as input
//     ports but are also connected to `.shift` / `.saved_dataN` of the helper
//     modules. If those sub-module ports are outputs, these nets are driven from
//     inside this module -- confirm the intended port direction.
// -----------------------------------------------------------------------------
module SystolicArray4x4_top (
    // Ports from the upper module
    input  wire        Clock,
    input  wire        rst_n,
    input  wire        data_clear,
    input  wire        en_b_shift_bottom,
    input  wire        en_shift_right,
    input  wire        en_shift_bottom,
    input  wire [15:0] a_left_in_flat     [0:3],
    input  wire [15:0] b_top_in_flat      [0:3],
    input  wire [15:0] ps_top_in_flat     [0:3],
    output wire [15:0] ps_bottom_out_flat [0:3]
);

    // UART bundle coming out of the processor: 1-bit rw flag + 8-bit data.
    wire       uart_rw;
    wire [7:0] uart_data;

    // =================================================================
    // 1. Instantiate the submodule (SystolicArray4x4)
    // =================================================================
    SystolicArray4x4 u_systolic (
        .Clock              (Clock),
        .rst_n              (rst_n),
        .data_clear         (data_clear),
        .en_b_shift_bottom  (en_b_shift_bottom),
        .en_shift_right     (en_shift_right),
        .en_shift_bottom    (en_shift_bottom),
        .a_left_in_flat     (a_left_in_flat),
        .b_top_in_flat      (b_top_in_flat),
        .ps_top_in_flat     (ps_top_in_flat),
        .ps_bottom_out_flat (ps_bottom_out_flat)
    );

    // Scalar copies of the four bottom outputs of the array.
    wire [15:0] bm_data0;
    wire [15:0] bm_data1;
    wire [15:0] bm_data2;
    wire [15:0] bm_data3;
    assign bm_data0 = ps_bottom_out_flat[0];
    assign bm_data1 = ps_bottom_out_flat[1];
    assign bm_data2 = ps_bottom_out_flat[2];
    assign bm_data3 = ps_bottom_out_flat[3];

    // Pack two 16-bit results per 32-bit word; the even-indexed result sits in
    // the lower half-word, the odd-indexed one in the upper half-word.
    wire [31:0] dma_in_data0;
    wire [31:0] dma_in_data1;
    assign dma_in_data0 = {bm_data1, bm_data0};
    assign dma_in_data1 = {bm_data3, bm_data2};

    // RISC-V processor instance
    RV32IM uRV32IM (
        // Clock & Reset
        .clock    (Clock),
        .reset_n  (rst_n),
        // uart_rw is the MSB of the bundle, uart_data the low 8 bits
        .uart_out ({uart_rw, uart_data}),
        .DMA_in0  (dma_in_data0),
        .DMA_in1  (dma_in_data1)
    );

    // Shift enable for right shift
    shift_module #(
        .EN_SHIFT_ADDR (8'hFF)
    ) right_shift_module (
        // Clock & Reset
        .Clock   (Clock),
        .rst_n   (rst_n),
        // UART interface
        .uart_rw (uart_rw),
        .uart_in (uart_data),
        .shift   (en_shift_right)
    );

    // Shift enable for b data shift
    shift_module #(
        .EN_SHIFT_ADDR (8'hFE)
    ) b_shift_module (
        // Clock & Reset
        .Clock   (Clock),
        .rst_n   (rst_n),
        // UART interface
        .uart_rw (uart_rw),
        .uart_in (uart_data),
        .shift   (en_b_shift_bottom)
    );

    // Shift enable for bottom shift
    shift_module #(
        .EN_SHIFT_ADDR (8'hFD)
    ) bottom_shift_module (
        // Clock & Reset
        .Clock   (Clock),
        .rst_n   (rst_n),
        // UART interface
        .uart_rw (uart_rw),
        .uart_in (uart_data),
        .shift   (en_shift_bottom)
    );

    // Data input module for matrix A
    data_16x4_module #(
        .DATA_WRITE_ADDR (8'hFC)
    ) a_data_16x4_module (
        // Clock & Reset
        .Clock       (Clock),
        .rst_n       (rst_n),
        // UART interface
        .uart_rw     (uart_rw),
        .uart_in     (uart_data),
        .saved_data0 (a_left_in_flat[0]),
        .saved_data1 (a_left_in_flat[1]),
        .saved_data2 (a_left_in_flat[2]),
        .saved_data3 (a_left_in_flat[3])
    );

    // Data input module for matrix B
    data_16x4_module #(
        .DATA_WRITE_ADDR (8'hFB)
    ) b_data_16x4_module (
        // Clock & Reset
        .Clock       (Clock),
        .rst_n       (rst_n),
        // UART interface
        .uart_rw     (uart_rw),
        .uart_in     (uart_data),
        .saved_data0 (b_top_in_flat[0]),
        .saved_data1 (b_top_in_flat[1]),
        .saved_data2 (b_top_in_flat[2]),
        .saved_data3 (b_top_in_flat[3])
    );

    // Data input module for PS in
    data_16x4_module #(
        .DATA_WRITE_ADDR (8'hFA)
    ) ps_in_module (
        // Clock & Reset
        .Clock       (Clock),
        .rst_n       (rst_n),
        // UART interface
        .uart_rw     (uart_rw),
        .uart_in     (uart_data),
        .saved_data0 (ps_top_in_flat[0]),
        .saved_data1 (ps_top_in_flat[1]),
        .saved_data2 (ps_top_in_flat[2]),
        .saved_data3 (ps_top_in_flat[3])
    );

    // =================================================================
    // 2. VCD dump settings (for Icarus Verilog simulation)
    // =================================================================
    // Debugging signals: one scalar alias per array element, so that every
    // element appears as its own named signal at this hierarchy level
    // (presumably because the unpacked array ports themselves do not show up
    // in the VCD -- confirm against the simulator in use).
    wire [15:0] a_left_in_flat_0;
    wire [15:0] a_left_in_flat_1;
    wire [15:0] a_left_in_flat_2;
    wire [15:0] a_left_in_flat_3;
    assign a_left_in_flat_0 = a_left_in_flat[0];
    assign a_left_in_flat_1 = a_left_in_flat[1];
    assign a_left_in_flat_2 = a_left_in_flat[2];
    assign a_left_in_flat_3 = a_left_in_flat[3];

    wire [15:0] b_top_in_flat_0;
    wire [15:0] b_top_in_flat_1;
    wire [15:0] b_top_in_flat_2;
    wire [15:0] b_top_in_flat_3;
    assign b_top_in_flat_0 = b_top_in_flat[0];
    assign b_top_in_flat_1 = b_top_in_flat[1];
    assign b_top_in_flat_2 = b_top_in_flat[2];
    assign b_top_in_flat_3 = b_top_in_flat[3];

    wire [15:0] ps_bottom_out_flat_0;
    wire [15:0] ps_bottom_out_flat_1;
    wire [15:0] ps_bottom_out_flat_2;
    wire [15:0] ps_bottom_out_flat_3;
    assign ps_bottom_out_flat_0 = ps_bottom_out_flat[0];
    assign ps_bottom_out_flat_1 = ps_bottom_out_flat[1];
    assign ps_bottom_out_flat_2 = ps_bottom_out_flat[2];
    assign ps_bottom_out_flat_3 = ps_bottom_out_flat[3];

    wire [15:0] ps_top_in_flat_0;
    wire [15:0] ps_top_in_flat_1;
    wire [15:0] ps_top_in_flat_2;
    wire [15:0] ps_top_in_flat_3;
    assign ps_top_in_flat_0 = ps_top_in_flat[0];
    assign ps_top_in_flat_1 = ps_top_in_flat[1];
    assign ps_top_in_flat_2 = ps_top_in_flat[2];
    assign ps_top_in_flat_3 = ps_top_in_flat[3];

    // Dump one level of hierarchy for the top module and for every instance
    // directly below it.
    initial begin
        $dumpfile("sa4x4.vcd"); // Output file name for VCD dump
        $dumpvars(1, SystolicArray4x4_top);
        $dumpvars(1, SystolicArray4x4_top.u_systolic);
        $dumpvars(1, SystolicArray4x4_top.uRV32IM);
        $dumpvars(1, SystolicArray4x4_top.right_shift_module);
        $dumpvars(1, SystolicArray4x4_top.b_shift_module);
        $dumpvars(1, SystolicArray4x4_top.bottom_shift_module);
        $dumpvars(1, SystolicArray4x4_top.a_data_16x4_module);
        $dumpvars(1, SystolicArray4x4_top.b_data_16x4_module);
        // ps_in_module was the only instance missing from the dump list.
        $dumpvars(1, SystolicArray4x4_top.ps_in_module);
    end

endmodule
ほぼ動作確認。
動作結果
シフト動作確認済み
データ入力動作確認済み
RISCVへデータが戻ってることを確認済み
0.00ns INFO cocotb Seeding Python random module with 1738856207
0.00ns INFO cocotb.regression pytest not found, install it to enable better AssertionError messages
0.00ns INFO cocotb.regression Found test test_SA_riscv.test_systolic_array
0.00ns INFO cocotb.regression running test_systolic_array (1/1)
Simulation test using cocotb.
Compares the Python model's output with the Verilog (hardware) output.
=== Current PE State (Matrix Format) ===
>>> a_reg:
[
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
]
>>> b_reg:
[
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
]
VCD info: dumpfile sa4x4.vcd opened for output.
=== Current PE State (Matrix Format) ===
>>> a_reg:
[
[1 1 1 1]
[1 2 1 1]
[1 1 5 1]
[3 1 1 4]
]
>>> b_reg:
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 1 2 7]
]
=============== Output data =======================
50090.00ns INFO cocotb === Memory Dump [0x0000 .. 0x0320] ===
50090.00ns INFO cocotb 0x0000: 13 01 01 FD 93 07 C0 1A 83 A5 07 00 03 A6 47 00
50090.00ns INFO cocotb 0x0010: 83 A6 87 00 03 A7 C7 00 23 20 B1 02 23 22 C1 02
50090.00ns INFO cocotb 0x0020: 23 24 D1 02 23 26 E1 02 83 A5 07 01 03 A6 47 01
50090.00ns INFO cocotb 0x0030: 83 A6 87 01 03 A7 C7 01 23 28 B1 00 23 2A C1 00
50090.00ns INFO cocotb 0x0040: 23 2C D1 00 23 2E E1 00 03 A6 07 02 83 A6 47 02
50090.00ns INFO cocotb 0x0050: 03 A7 87 02 83 A7 C7 02 23 20 C1 00 23 22 D1 00
50090.00ns INFO cocotb 0x0060: 23 24 E1 00 23 26 F1 00 93 07 F0 0F 23 28 F0 4E
50090.00ns INFO cocotb 0x0070: 23 28 00 24 93 07 E0 0F 23 28 F0 4E 23 28 00 24
50090.00ns INFO cocotb 0x0080: 13 06 00 00 6F 00 00 05 93 06 30 00 B3 86 C6 40
50090.00ns INFO cocotb 0x0090: 93 17 27 00 93 87 07 03 B3 87 27 00 B3 87 D7 00
50090.00ns INFO cocotb 0x00A0: 83 C7 07 FD 93 F7 F7 0F 23 28 F0 4E 23 28 00 24
50090.00ns INFO cocotb 0x00B0: 23 28 00 4E 23 28 00 24 13 07 17 00 93 07 30 00
50090.00ns INFO cocotb 0x00C0: E3 D4 E7 FC 93 07 F0 0F 23 28 F0 4E 23 28 00 24
50090.00ns INFO cocotb 0x00D0: 13 06 16 00 93 07 30 00 63 CC C7 00 93 07 C0 0F
50090.00ns INFO cocotb 0x00E0: 23 28 F0 4E 23 28 00 24 13 07 00 00 6F F0 1F FD
50090.00ns INFO cocotb 0x00F0: 93 06 00 00 6F 00 00 05 93 07 30 00 B3 87 D7 40
50090.00ns INFO cocotb 0x0100: 93 97 27 00 93 87 07 03 B3 87 27 00 B3 87 E7 00
50090.00ns INFO cocotb 0x0110: 83 C7 07 FE 93 F7 F7 0F 23 28 F0 4E 23 28 00 24
50090.00ns INFO cocotb 0x0120: 23 28 00 4E 23 28 00 24 13 07 17 00 93 07 30 00
50090.00ns INFO cocotb 0x0130: E3 D4 E7 FC 93 07 E0 0F 23 28 F0 4E 23 28 00 24
50090.00ns INFO cocotb 0x0140: 93 86 16 00 93 07 30 00 63 CC D7 00 93 07 B0 0F
50090.00ns INFO cocotb 0x0150: 23 28 F0 4E 23 28 00 24 13 07 00 00 6F F0 1F FD
50090.00ns INFO cocotb 0x0160: 93 07 00 00 6F 00 40 01 13 07 D0 0F 23 28 E0 4E
50090.00ns INFO cocotb 0x0170: 23 28 00 24 93 87 17 00 13 07 30 00 E3 56 F7 FE
50090.00ns INFO cocotb 0x0180: 93 07 00 00 6F 00 C0 00 23 28 00 24 93 87 17 00
50090.00ns INFO cocotb 0x0190: 13 07 30 01 E3 5A F7 FE 03 27 00 40 83 27 40 40
50090.00ns INFO cocotb 0x01A0: 23 20 E0 30 23 28 F0 30 6F 00 00 00 01 01 01 01
50090.00ns INFO cocotb 0x01B0: 01 01 01 01 01 01 01 01 01 01 01 01 01 03 04 01
50090.00ns INFO cocotb 0x01C0: 08 05 02 03 02 02 05 01 05 01 02 07 01 01 01 01
50090.00ns INFO cocotb 0x01D0: 01 02 01 01 01 01 05 01 03 01 01 04 00 00 00 00
50090.00ns INFO cocotb 0x01E0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x01F0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0220: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0230: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0240: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0250: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0260: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0270: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0280: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0290: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02A0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02B0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02C0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02D0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02E0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x02F0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0300: 1A 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0310: 21 00 21 00 00 00 00 00 00 00 00 00 00 00 00 00
50090.00ns INFO cocotb 0x0320: 00
50090.00ns INFO cocotb ============================================
50090.00ns INFO cocotb.regression test_systolic_array passed
50090.00ns INFO cocotb.regression *******************************************************************************************
** TEST STATUS SIM TIME (ns) REAL TIME (s) RATIO (ns/s) **
*******************************************************************************************
** test_SA_riscv.test_systolic_array PASS 50090.00 1.15 43417.10 **
*******************************************************************************************
** TESTS=1 PASS=1 FAIL=0 SKIP=0 50090.00 1.23 40737.97 **
*******************************************************************************************
プレテスト:8bitの場合の検証
Matrix A
[
[1 1 1 1]
[1 1 1 1]
[1 1 1 1]
[1 1 1 1]
]
Matrix B
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 5 5 7]
]
1列目 = 1+8+2+5=16 = 0x10
2列目 = 3+5+2+5=15 = 0x0F
3列目 = 4+2+5+5=16 = 0x10
4列目 = 1+3+1+7=12 = 0x0C
メモリダンプは
0x0300: 10 0F 10 0C 00 00 00 00 00 00 00 00 00 00 00 00
16bitの場合の検証
Matrix A
[
[1 1 1 1]
[1 4 1 1]
[1 1 1 0]
[2 1 1 1]
]
Matrix B
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 1 2 7]
]
1列目 = 1+8+2+5*2= 21= 0x15
2列目 = 3+5*4+2+1= 26= 0x1A
3列目 = 4+2+5+2= 13= 0x0D
4列目 = 1+3+0+7= 11= 0x0B
メモリダンプは
0x0300: 15 00 1A 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0310: 0D 00 0B 00 00 00 00 00 00 00 00 00 00 00 00 00
ですから、最終目的のシストリックアレイではまだありませんが、動作はOKのようです。基本は完成です。あとはRISCVのソフトウェア制御ですので、Cプログラムを追記していけば、どうとでもなるでしょう。現状、ベアメタルのプログラムですので、C言語の関数などは使用できないみたいです。それでもアセンブラで記述するより、かなり楽ですが。
最終結果
シストリックアレイの計算を実行する
>>> a_reg:
[
[1 1 1 1]
[1 4 1 1]
[1 1 1 0]
[2 1 1 1]
]
>>> b_reg:
[
[1 3 4 1]
[8 5 2 3]
[2 2 5 1]
[5 1 2 7]
]
A x B の行列積は、
>>> a x b matrix:
[
[16 11 13 12]
[40 26 19 21]
[11 10 11 5]
[17 14 17 13]
]
Gtkwaveで確認すると、

bm_data0,1,2,3は
16
40,11
11,26,13
17,10,19,12
・・・合ってますね。
16進数では
0x010
0x028, 0x00B
0x00B, 0x01A, 0x00D
0x011, 0x00A, 0x013, 0x00C
→メモリダンプは
0x0300: 10 00 00 00 00 00 00 00 28 00 0B 00 00 00 00 00
0x0310: 0B 00 1A 00 0D 00 00 00 11 00 0A 00 13 00 0C 00
→合ってますね!
あとは、シストリックアレイのダミーシフトを入れれば終了です。
(続く)
所感
CPU制御ですとクロックをかなり消費してしまい、遅いです。データの転送部分はuartではなく、DMA転送にすべきです。
これがTPUと言われるAI計算用の専用半導体の中身だそうです。本物のシリコンチップであれば縦横数百のPEを並べて並列計算するのも余裕でしょう。