计算机原理与结构 实验3 《单周期微处理器设计》
一、实验目的
1、掌握:如何采用System Verilog设计时序逻辑电路; 2、掌握:利用Quartus II / Modelsim 设计仿真CPU的基本原理和方法; 3、掌握:指令系统的设计与实现方法。
二、实验内容
用System Verilog设计一个单周期的ARM微处理器,其体系结构描述如下:
// 16 32-bit registers
// Data-processing instructions
// ADD, SUB, AND, ORR
// INSTR<cond><S> rd, rn, #immediate
// INSTR<cond><S> rd, rn, rm
// rd <- rn INSTR rm if (S) Update Status Flags
// rd <- rn INSTR immediate if (S) Update Status Flags
// Instr[31:28] = cond
// Instr[27:26] = op = 00
// Instr[25:20] = funct
// [25]: 1 for immediate, 0 for register
// [24:21]: 0100 (ADD) / 0010 (SUB) /
// 0000 (AND) / 1100 (ORR)
// [20]: S (1 = update CPSR status Flags)
// Instr[19:16] = rn
// Instr[15:12] = rd
// Instr[11:8] = 0000
// Instr[7:0] = imm8 (for #immediate type) /
// {0000,rm} (for register type)
//
// Load/Store instructions
// LDR, STR
// INSTR rd, [rn, #offset]
// LDR: rd <- Mem[rn+offset]
// STR: Mem[rn+offset] <- rd
// Instr[31:28] = cond
// Instr[27:26] = op = 01
// Instr[25:20] = funct
// [25]: 0 (A)
// [24:21]: 1100 (P/U/B/W)
// [20]: L (1 for LDR, 0 for STR)
// Instr[19:16] = rn
// Instr[15:12] = rd
// Instr[11:0] = imm12 (zero extended)
//
// Branch instruction (PC <= PC + offset, PC holds 8 bytes past Branch Instr)
// B
// B target
// PC <- PC + 8 + imm24 << 2
// Instr[31:28] = cond
// Instr[27:26] = op = 10
// Instr[25:24] = funct
// [25]: 1 (Branch)
// [24]: 0 (link)
// Instr[23:0] = imm24 (sign extend, shift left 2)
// Note: no Branch delay slot on ARM
//
// Other:
// R15 reads as PC+8
// Conditional Encoding
// cond Meaning Flag
// 0000 Equal Z = 1
// 0001 Not Equal Z = 0
// 0010 Carry Set C = 1
// 0011 Carry Clear C = 0
// 0100 Minus N = 1
// 0101 Plus N = 0
// 0110 Overflow V = 1
// 0111 No Overflow V = 0
// 1000 Unsigned Higher C = 1 & Z = 0
// 1001 Unsigned Lower/Same C = 0 | Z = 1
// 1010 Signed greater/equal N = V
// 1011 Signed less N != V
// 1100 Signed greater N = V & Z = 0
// 1101 Signed less/equal N != V | Z = 1
// 1110 Always any
其测试程序为:
// If successful, it should write the value 7 to address 100
MAIN: SUB R0, R15, R15 ; R0 = 0
ADD R2, R0, #5 ; R2 = 5
ADD R3, R0, #12 ; R3 = 12
SUB R7, R3, #9 ; R7 = 3
ORR R4, R7, R2 ; R4 = 3 OR 5 = 7
AND R5, R3, R4 ; R5 = 12 AND 7 = 4
ADD R5, R5, R4 ; R5 = 4 + 7 = 11
SUBS R8, R5, R7 ; R8 <= 11 - 3 = 8, set Flags
BEQ END ; shouldn't be taken
SUBS R8, R3, R4 ; R8 = 12 - 7 = 5
BGE AROUND ; should be taken
ADD R5, R0, #0 ; should be skipped
AROUND:
SUBS R8, R7, R2 ; R8 = 3 - 5 = -2, set Flags
ADDLT R7, R5, #1 ; R7 = 11 + 1 = 12
SUB R7, R7, R2 ; R7 = 12 - 5 = 7
STR R7, [R3, #84] ; mem[12+84] = 7
LDR R2, [R0, #96] ; R2 = mem[96] = 7
ADD R15, R15, R0 ; PC <- PC + 8 (skips next)
ADD R2, R0, #14 ; shouldn't happen
B END ; always taken
ADD R2, R0, #13 ; shouldn't happen
ADD R2, R0, #10 ; shouldn't happen
END:STR R2, [R0, #100] ; mem[100] = 7
// MAIN SUB R0, R15, R15 ; R0 = 0
1110 000 0010 0 1111 0000 0000 0000 1111 E04F000F 0x00
// ADD R2, R0, #5 ; R2 = 5
1110 001 0100 0 0000 0010 0000 0000 0101 E2802005 0x04
// ADD R3, R0, #12 ; R3 = 12
1110 001 0100 0 0000 0011 0000 0000 1100 E280300C 0x08
// SUB R7, R3, #9 ; R7 = 3
1110 001 0010 0 0011 0111 0000 0000 1001 E2437009 0x0c
// ORR R4, R7, R2 ; R4 = 3 OR 5 = 7
// 1110 000 1100 0 0111 0100 0000 0000 0010 E1874002 0x10
// AND R5, R3, R4 ; R5 = 12 AND 7 = 4
// 1110 000 0000 0 0011 0101 0000 0000 0100 E0035004 0x14
// ADD R5, R5, R4 ; R5 = 4 + 7 = 11
// 1110 000 0100 0 0101 0101 0000 0000 0100 E0855004 0x18
// SUBS R8, R5, R7 ; R8 <= 11 - 3 = 8, set Flags
// 1110 000 0010 1 0101 1000 0000 0000 0111 E0558007 0x1c
// BEQ END ; shouldn't be taken
// 0000 1010 0000 0000 0000 0000 0000 1100 0A00000C 0x20
// SUBS R8, R3, R4 ; R8 = 12 - 7 = 5
// 1110 000 0010 1 0011 1000 0000 0000 0100 E0538004 0x24
// BGE AROUND ; should be taken
// 1010 1010 0000 0000 0000 0000 0000 0000 AA000000 0x28
// ADD R5, R0, #0 ; should be skipped
// 1110 001 0100 0 0000 0101 0000 0000 0000 E2805000 0x2c
// AROUND SUBS R8, R7, R2 ; R8 = 3 - 5 = -2, set Flags
// 1110 000 0010 1 0111 1000 0000 0000 0010 E0578002 0x30
// ADDLT R7, R5, #1 ; R7 = 11 + 1 = 12
// 1011 001 0100 0 0101 0111 0000 0000 0001 B2857001 0x34
// SUB R7, R7, R2 ; R7 = 12 - 5 = 7
// 1110 000 0010 0 0111 0111 0000 0000 0010 E0477002 0x38
// STR R7, [R3, #84] ; mem[12+84] = 7
// 1110 010 1100 0 0011 0111 0000 0101 0100 E5837054 0x3c
// LDR R2, [R0, #96] ; R2 = mem[96] = 7
// 1110 010 1100 1 0000 0010 0000 0110 0000 E5902060 0x40
// ADD R15, R15, R0 ; PC <- PC + 8 (skips next)
// 1110 000 0100 0 1111 1111 0000 0000 0000 E08FF000 0x44
// ADD R2, R0, #14 ; shouldn't happen
// 1110 001 0100 0 0000 0010 0000 0000 1110 E280200E 0x48
// B END ; always taken
// 1110 1010 0000 0000 0000 0000 0000 0001 EA000001 0x4c
// ADD R2, R0, #13 ; shouldn't happen
// 1110 001 0100 0 0000 0010 0000 0000 1101 E280200D 0x50
// ADD R2, R0, #10 ; shouldn't happen
// 1110 001 0100 0 0000 0010 0000 0000 1010 E280200A 0x54
// END STR R2, [R0, #100] ; mem[100] = 7
// 1110 010 1100 0 0000 0010 0000 0110 0100 E5802064 0x58
三、实验方法
使用System Verilog语言设计一个单周期的ARM微处理器,并通过ModelSim 10.1a进行仿真,使用相应的汇编程序对设计好的ARM微处理器进行测试。
四、实验步骤
1、新建一个ModelSim项目,添加一个System Verilog文件arm_single.sv,在其中编写单周期ARM微处理器的System Verilog代码并保存、编译,代码见附录。
2、在arm_single.sv同级目录下,新建memfile.dat文件,其中写入相关数据如E5802064、E280200A等,如下:
3、在arm_single.sv同级目录下,新建memfile.s文件,写入测试程序,如下:
4、通过ModelSim工具的Simulate>Start Simulation对写好的单周期ARM微处理器arm_single.sv进行仿真。 5、将时钟信号clk、复位信号reset,以及PC和Instr信号加入示波器Wave,并设定PC和Instr的格式为十六进制,设定仿真周期为10ns。点击Run按钮开始仿真。
五、实验结果
点击Run开始仿真之后记录仿真示波器的结果,结果是符合预期的。开始的一段:
之后:
可见实验结果是符合预期的。
六、实验结论
本次实验使用ModelSim 10.1a开发工具和System Verilog语言设计了一个单周期的ARM微处理器,并用汇编测试程序对该处理器进行了仿真验证,结果符合预期。
七、实验小结
通过本次实验,主要是掌握了使用System Verilog语言构建一个简单的单周期ARM微处理器的方法,以及如何对构建好的ARM微处理器使用ARM汇编程序进行测试的方法。
本次实验遇到了一些问题,如一开始跟着老师给的教程开始做的时候,开始仿真之后找不到相关信号PC、Instr。后来才发现自己点击仿真后没有进行合理配置,没有选择testbench模块进行仿真。但是即使使用了testbench模块进行仿真也还是拿不到预期的那些信号,上网查阅资料后发现是因为仿真时不慎勾选了Enable Optimization选项导致对代码的优化把相关的信号错误地去除了。上述错误改正之后即得到了预期的实验结果。
ARM微处理器凭借强大的处理能力和极低的功耗,受到越来越多公司的青睐,许多公司在产品选型时都会考虑使用ARM微处理器。另外,随着ARM功能的不断增强和完善,它在某些方面已经可以取代原先X86架构的单片机,特别是在工控领域。基于ARM的处理器具有速度高、功耗低、价格低等优点,被广泛应用于以下领域:在通信、消费电子、成像设备等产品中,提供可运行复杂操作系统的开放应用平台;在海量存储、汽车电子、工业控制和网络应用等领域,提供实时嵌入式应用;以及信用卡、SIM卡等安全系统。
八、附录
ARM微处理器代码如下。
// arm_single.sv
// David_Harris@hmc.edu and Sarah_Harris@hmc.edu 25 June 2013
// Single-cycle implementation of a subset of ARMv4
//
// run 210
// Expect simulator to print "Simulation succeeded"
// when the value 7 is written to address 100 (0x64)
// 16 32-bit registers
// Data-processing instructions
// ADD, SUB, AND, ORR
// INSTR<cond><S> rd, rn, #immediate
// INSTR<cond><S> rd, rn, rm
// rd <- rn INSTR rm if (S) Update Status Flags
// rd <- rn INSTR immediate if (S) Update Status Flags
// Instr[31:28] = cond
// Instr[27:26] = op = 00
// Instr[25:20] = funct
// [25]: 1 for immediate, 0 for register
// [24:21]: 0100 (ADD) / 0010 (SUB) /
// 0000 (AND) / 1100 (ORR)
// [20]: S (1 = update CPSR status Flags)
// Instr[19:16] = rn
// Instr[15:12] = rd
// Instr[11:8] = 0000
// Instr[7:0] = imm8 (for #immediate type) /
// {0000,rm} (for register type)
//
// Load/Store instructions
// LDR, STR
// INSTR rd, [rn, #offset]
// LDR: rd <- Mem[rn+offset]
// STR: Mem[rn+offset] <- rd
// Instr[31:28] = cond
// Instr[27:26] = op = 01
// Instr[25:20] = funct
// [25]: 0 (A)
// [24:21]: 1100 (P/U/B/W)
// [20]: L (1 for LDR, 0 for STR)
// Instr[19:16] = rn
// Instr[15:12] = rd
// Instr[11:0] = imm12 (zero extended)
//
// Branch instruction (PC <= PC + offset, PC holds 8 bytes past Branch Instr)
// B
// B target
// PC <- PC + 8 + imm24 << 2
// Instr[31:28] = cond
// Instr[27:26] = op = 10
// Instr[25:24] = funct
// [25]: 1 (Branch)
// [24]: 0 (link)
// Instr[23:0] = imm24 (sign extend, shift left 2)
// Note: no Branch delay slot on ARM
//
// Other:
// R15 reads as PC+8
// Conditional Encoding
// cond Meaning Flag
// 0000 Equal Z = 1
// 0001 Not Equal Z = 0
// 0010 Carry Set C = 1
// 0011 Carry Clear C = 0
// 0100 Minus N = 1
// 0101 Plus N = 0
// 0110 Overflow V = 1
// 0111 No Overflow V = 0
// 1000 Unsigned Higher C = 1 & Z = 0
// 1001 Unsigned Lower/Same C = 0 | Z = 1
// 1010 Signed greater/equal N = V
// 1011 Signed less N != V
// 1100 Signed greater N = V & Z = 0
// 1101 Signed less/equal N != V | Z = 1
// 1110 Always any
// Self-checking testbench for the single-cycle ARM system.
// Instantiates `top`, drives clk and reset, and monitors memory writes:
// prints "Simulation succeeded" when the value 7 is written to address 100,
// and "Simulation failed" on a write to any address other than 96.
// Either outcome halts the simulation with $stop.
module testbench();
logic clk;
logic reset;
logic [31:0] WriteData, DataAdr;
logic MemWrite;
// instantiate device to be tested
top dut(clk, reset, WriteData, DataAdr, MemWrite);
// initialize test: assert reset for the first 22 time units, then release it
initial
begin
reset <= 1; # 22; reset <= 0;
end
// generate clock to sequence tests
// (period of 10 time units: high for 5, then low for 5)
always
begin
clk <= 1; # 5; clk <= 0; # 5;
end
// check results on each falling clock edge, only while a store is in progress
always @(negedge clk)
begin
if(MemWrite) begin
// === and !== are 4-state comparisons, so an X/Z address or data value
// never satisfies the success test and always trips the failure test
if(DataAdr === 100 & WriteData === 7) begin
$display("Simulation succeeded");
$stop;
// the only other store tolerated is one to address 96
end else if (DataAdr !== 96) begin
$display("Simulation failed");
$stop;
end
end
end
endmodule
// Top-level system: the ARM core wired to its instruction and data memories.
// WriteData, DataAdr and MemWrite are brought out as ports so that an
// enclosing testbench can observe every store the processor performs.
module top(input  logic        clk, reset,
           output logic [31:0] WriteData, DataAdr,
           output logic        MemWrite);

  logic [31:0] PC, Instr, ReadData;

  // processor core; its ALU result is used as the data-memory address
  arm arm(.clk      (clk),
          .reset    (reset),
          .PC       (PC),
          .Instr    (Instr),
          .MemWrite (MemWrite),
          .ALUResult(DataAdr),
          .WriteData(WriteData),
          .ReadData (ReadData));

  // instruction memory, addressed by the program counter
  imem imem(.a (PC),
            .rd(Instr));

  // data memory
  dmem dmem(.clk(clk),
            .we (MemWrite),
            .a  (DataAdr),
            .wd (WriteData),
            .rd (ReadData));
endmodule
// Data memory: 64 words of 32 bits.
// Reads are combinational; writes happen on the rising clock edge when `we`
// is high. The two low address bits are ignored (word-aligned access).
module dmem(input  logic        clk, we,
            input  logic [31:0] a, wd,
            output logic [31:0] rd);

  logic [31:0] RAM[63:0];
  logic [29:0] word_addr;

  // drop the byte offset to obtain the word index
  assign word_addr = a[31:2];

  // synchronous write port
  always_ff @(posedge clk) begin
    if (we)
      RAM[word_addr] <= wd;
  end

  // asynchronous read port
  assign rd = RAM[word_addr];
endmodule
// Instruction memory: 64 words of 32 bits, read combinationally.
// Contents are loaded once at the start of simulation from the hex file
// "memfile.dat". The two low address bits are ignored (word-aligned access).
module imem(input  logic [31:0] a,
            output logic [31:0] rd);

  logic [31:0] RAM[63:0];
  logic [29:0] word_addr;

  // load the program image
  initial begin
    $readmemh("memfile.dat", RAM);
  end

  // drop the byte offset to obtain the word index
  assign word_addr = a[31:2];
  assign rd        = RAM[word_addr];
endmodule
// Single-cycle ARM processor core: a controller plus a datapath.
// The controller decodes Instr[31:12] together with the ALU flags and drives
// the datapath's select and write-enable signals.
module arm(input  logic        clk, reset,
           output logic [31:0] PC,
           input  logic [31:0] Instr,
           output logic        MemWrite,
           output logic [31:0] ALUResult, WriteData,
           input  logic [31:0] ReadData);

  // control signals passed from controller to datapath
  logic [1:0] RegSrc, ImmSrc, ALUControl;
  logic       RegWrite, ALUSrc, MemtoReg, PCSrc;
  // status flags passed from datapath (ALU) back to controller
  logic [3:0] ALUFlags;

  controller c(.clk       (clk),
               .reset     (reset),
               .Instr     (Instr[31:12]),
               .ALUFlags  (ALUFlags),
               .RegSrc    (RegSrc),
               .RegWrite  (RegWrite),
               .ImmSrc    (ImmSrc),
               .ALUSrc    (ALUSrc),
               .ALUControl(ALUControl),
               .MemWrite  (MemWrite),
               .MemtoReg  (MemtoReg),
               .PCSrc     (PCSrc));

  datapath dp(.clk       (clk),
              .reset     (reset),
              .RegSrc    (RegSrc),
              .RegWrite  (RegWrite),
              .ImmSrc    (ImmSrc),
              .ALUSrc    (ALUSrc),
              .ALUControl(ALUControl),
              .MemtoReg  (MemtoReg),
              .PCSrc     (PCSrc),
              .ALUFlags  (ALUFlags),
              .PC        (PC),
              .Instr     (Instr),
              .ALUResult (ALUResult),
              .WriteData (WriteData),
              .ReadData  (ReadData));
endmodule
// Control unit: an instruction decoder followed by conditional-execution
// logic. The decoder produces unconditional control signals; condlogic gates
// the write enables and PC select with the instruction's condition field.
module controller(input  logic         clk, reset,
                  input  logic [31:12] Instr,
                  input  logic [3:0]   ALUFlags,
                  output logic [1:0]   RegSrc,
                  output logic         RegWrite,
                  output logic [1:0]   ImmSrc,
                  output logic         ALUSrc,
                  output logic [1:0]   ALUControl,
                  output logic         MemWrite, MemtoReg,
                  output logic         PCSrc);

  // unconditional requests from the decoder
  logic [1:0] FlagW;
  logic       PCS, RegW, MemW;

  decoder dec(.Op        (Instr[27:26]),
              .Funct     (Instr[25:20]),
              .Rd        (Instr[15:12]),
              .FlagW     (FlagW),
              .PCS       (PCS),
              .RegW      (RegW),
              .MemW      (MemW),
              .MemtoReg  (MemtoReg),
              .ALUSrc    (ALUSrc),
              .ImmSrc    (ImmSrc),
              .RegSrc    (RegSrc),
              .ALUControl(ALUControl));

  condlogic cl(.clk     (clk),
               .reset   (reset),
               .Cond    (Instr[31:28]),
               .ALUFlags(ALUFlags),
               .FlagW   (FlagW),
               .PCS     (PCS),
               .RegW    (RegW),
               .MemW    (MemW),
               .PCSrc   (PCSrc),
               .RegWrite(RegWrite),
               .MemWrite(MemWrite));
endmodule
// Instruction decoder.
//   Op    : Instr[27:26]  (00 data-processing, 01 load/store, 10 branch)
//   Funct : Instr[25:20]
//   Rd    : Instr[15:12]
// Produces the unconditional control signals; FlagW[1] requests an update of
// the upper flag pair and FlagW[0] of the lower flag pair.
module decoder(input  logic [1:0] Op,
               input  logic [5:0] Funct,
               input  logic [3:0] Rd,
               output logic [1:0] FlagW,
               output logic       PCS, RegW, MemW,
               output logic       MemtoReg, ALUSrc,
               output logic [1:0] ImmSrc, RegSrc, ALUControl);

  // Control words, laid out as
  // {RegSrc[1:0], ImmSrc[1:0], ALUSrc, MemtoReg, RegW, MemW, Branch, ALUOp}
  localparam logic [9:0] CTL_DP_IMM = 10'b00_00_1_0_1_0_0_1;
  localparam logic [9:0] CTL_DP_REG = 10'b00_00_0_0_1_0_0_1;
  localparam logic [9:0] CTL_LDR    = 10'b00_01_1_1_1_0_0_0;
  localparam logic [9:0] CTL_STR    = 10'b10_01_1_1_0_1_0_0;
  localparam logic [9:0] CTL_BRANCH = 10'b01_10_1_0_0_0_1_0;

  logic [9:0] controls;
  logic       Branch, ALUOp;

  // Main decoder: pick the control word from Op and the I / L bits of Funct
  always_comb
    case (Op)
      2'b00: begin                       // data processing
        if (Funct[5]) controls = CTL_DP_IMM;
        else          controls = CTL_DP_REG;
      end
      2'b01: begin                       // memory access
        if (Funct[0]) controls = CTL_LDR;
        else          controls = CTL_STR;
      end
      2'b10:   controls = CTL_BRANCH;    // B
      default: controls = 10'bx;         // unimplemented
    endcase

  // unpack the control word
  assign RegSrc   = controls[9:8];
  assign ImmSrc   = controls[7:6];
  assign ALUSrc   = controls[5];
  assign MemtoReg = controls[4];
  assign RegW     = controls[3];
  assign MemW     = controls[2];
  assign Branch   = controls[1];
  assign ALUOp    = controls[0];

  // ALU decoder
  always_comb begin
    if (ALUOp) begin
      // data-processing instruction: cmd field selects the operation
      case (Funct[4:1])
        4'b0100: ALUControl = 2'b00;     // ADD
        4'b0010: ALUControl = 2'b01;     // SUB
        4'b0000: ALUControl = 2'b10;     // AND
        4'b1100: ALUControl = 2'b11;     // ORR
        default: ALUControl = 2'bx;      // unimplemented
      endcase
      // S bit requests a flag update; the lower flag pair (C, V) is only
      // updated by the arithmetic operations ADD and SUB
      FlagW = {Funct[0],
               Funct[0] & ((ALUControl == 2'b00) | (ALUControl == 2'b01))};
    end else begin
      // non-DP instructions use the ALU as an adder and leave flags alone
      ALUControl = 2'b00;
      FlagW      = 2'b00;
    end
  end

  // PC is written by a branch or by a register write whose destination is R15
  assign PCS = Branch | (RegW & (Rd == 4'b1111));
endmodule
// Conditional-execution logic.
// Holds the four status flags in two independently enabled 2-bit registers,
// evaluates the instruction's condition against them, and gates the
// decoder's write / PC-select requests with the result.
module condlogic(input  logic       clk, reset,
                 input  logic [3:0] Cond,
                 input  logic [3:0] ALUFlags,
                 input  logic [1:0] FlagW,
                 input  logic       PCS, RegW, MemW,
                 output logic       PCSrc, RegWrite, MemWrite);

  logic [3:0] Flags;       // registered status flags
  logic [1:0] FlagWrite;   // per-pair flag register enables
  logic       CondEx;      // 1 when the current instruction's condition holds

  // evaluate the condition against the stored flags
  condcheck cc(.Cond  (Cond),
               .Flags (Flags),
               .CondEx(CondEx));

  // every state-changing request only takes effect if the condition holds
  assign PCSrc     = PCS  & CondEx;
  assign RegWrite  = RegW & CondEx;
  assign MemWrite  = MemW & CondEx;
  assign FlagWrite = {2{CondEx}} & FlagW;

  // upper flag pair, ALUFlags[3:2]
  flopenr #(.WIDTH(2)) flagreg1(.clk  (clk),
                                .reset(reset),
                                .en   (FlagWrite[1]),
                                .d    (ALUFlags[3:2]),
                                .q    (Flags[3:2]));

  // lower flag pair, ALUFlags[1:0]
  flopenr #(.WIDTH(2)) flagreg0(.clk  (clk),
                                .reset(reset),
                                .en   (FlagWrite[0]),
                                .d    (ALUFlags[1:0]),
                                .q    (Flags[1:0]));
endmodule
// Condition evaluator.
// Flags = {N, Z, C, V}. CondEx is 1 when the 4-bit condition code `Cond`
// is satisfied by the flags; code 1111 is undefined and yields X.
module condcheck(input  logic [3:0] Cond,
                 input  logic [3:0] Flags,
                 output logic       CondEx);

  logic n, z, c, v;
  logic ge, hi, gt;

  assign n = Flags[3];
  assign z = Flags[2];
  assign c = Flags[1];
  assign v = Flags[0];

  // shared compound terms
  assign ge = ~(n ^ v);    // signed >=  : N equals V
  assign hi = c & ~z;      // unsigned > : carry set and not zero
  assign gt = ge & ~z;     // signed >   : N equals V and not zero

  always_comb
    case (Cond)
      4'b0000: CondEx = z;        // EQ
      4'b0001: CondEx = ~z;       // NE
      4'b0010: CondEx = c;        // CS
      4'b0011: CondEx = ~c;       // CC
      4'b0100: CondEx = n;        // MI
      4'b0101: CondEx = ~n;       // PL
      4'b0110: CondEx = v;        // VS
      4'b0111: CondEx = ~v;       // VC
      4'b1000: CondEx = hi;       // HI
      4'b1001: CondEx = ~hi;      // LS
      4'b1010: CondEx = ge;       // GE
      4'b1011: CondEx = ~ge;      // LT
      4'b1100: CondEx = gt;       // GT
      4'b1101: CondEx = ~gt;      // LE
      4'b1110: CondEx = 1'b1;     // AL
      default: CondEx = 1'bx;     // undefined
    endcase
endmodule
// Datapath of the single-cycle processor: program counter and its
// incrementers, register file, immediate extender, ALU, and the multiplexers
// steered by the controller.
module datapath(input  logic        clk, reset,
                input  logic [1:0]  RegSrc,
                input  logic        RegWrite,
                input  logic [1:0]  ImmSrc,
                input  logic        ALUSrc,
                input  logic [1:0]  ALUControl,
                input  logic        MemtoReg,
                input  logic        PCSrc,
                output logic [3:0]  ALUFlags,
                output logic [31:0] PC,
                input  logic [31:0] Instr,
                output logic [31:0] ALUResult, WriteData,
                input  logic [31:0] ReadData);

  logic [31:0] PCNext, PCPlus4, PCPlus8;
  logic [31:0] ExtImm, SrcA, SrcB, Result;
  logic [3:0]  RA1, RA2;

  // ---- next-PC logic -----------------------------------------------------
  // PCSrc selects the instruction result over the sequential PC + 4
  mux2 #(.WIDTH(32)) pcmux(.d0(PCPlus4), .d1(Result), .s(PCSrc), .y(PCNext));
  flopr #(.WIDTH(32)) pcreg(.clk(clk), .reset(reset), .d(PCNext), .q(PC));
  adder #(.WIDTH(32)) pcadd1(.a(PC),      .b(32'b100), .y(PCPlus4));
  adder #(.WIDTH(32)) pcadd2(.a(PCPlus4), .b(32'b100), .y(PCPlus8));

  // ---- register file logic -----------------------------------------------
  // RegSrc[0] forces read port 1 to R15; RegSrc[1] makes read port 2 use
  // Instr[15:12] instead of Instr[3:0]
  mux2 #(.WIDTH(4)) ra1mux(.d0(Instr[19:16]), .d1(4'b1111),
                           .s(RegSrc[0]), .y(RA1));
  mux2 #(.WIDTH(4)) ra2mux(.d0(Instr[3:0]), .d1(Instr[15:12]),
                           .s(RegSrc[1]), .y(RA2));

  // the register file's R15 input is fed with PC + 8
  regfile rf(.clk(clk),
             .we3(RegWrite),
             .ra1(RA1),
             .ra2(RA2),
             .wa3(Instr[15:12]),
             .wd3(Result),
             .r15(PCPlus8),
             .rd1(SrcA),
             .rd2(WriteData));

  // MemtoReg selects memory read data over the ALU result
  mux2 #(.WIDTH(32)) resmux(.d0(ALUResult), .d1(ReadData),
                            .s(MemtoReg), .y(Result));

  extend ext(.Instr(Instr[23:0]), .ImmSrc(ImmSrc), .ExtImm(ExtImm));

  // ---- ALU logic -----------------------------------------------------------
  // ALUSrc selects the extended immediate over the second register operand
  mux2 #(.WIDTH(32)) srcbmux(.d0(WriteData), .d1(ExtImm),
                             .s(ALUSrc), .y(SrcB));

  alu alu(.a(SrcA),
          .b(SrcB),
          .ALUControl(ALUControl),
          .Result(ALUResult),
          .ALUFlags(ALUFlags));
endmodule
// Three-ported register file.
//   ra1/ra2 -> rd1/rd2 : two combinational read ports
//   wa3/wd3/we3        : one write port, written on the rising clock edge
//   r15                : value returned when either read address is 15
// Only registers 0..14 are stored. Address 15 has no storage: reads of it
// return the `r15` input, and writes to it are discarded.
module regfile(input  logic        clk,
               input  logic        we3,
               input  logic [3:0]  ra1, ra2, wa3,
               input  logic [31:0] wd3, r15,
               output logic [31:0] rd1, rd2);

  logic [31:0] rf[14:0];

  // Write port. The explicit wa3 != 15 guard keeps a write aimed at R15 from
  // indexing outside rf[14:0]; previously this relied on the simulator
  // silently dropping the out-of-range array write.
  always_ff @(posedge clk)
    if (we3 && (wa3 != 4'b1111)) rf[wa3] <= wd3;

  // Read ports: address 15 is redirected to the r15 input
  assign rd1 = (ra1 == 4'b1111) ? r15 : rf[ra1];
  assign rd2 = (ra2 == 4'b1111) ? r15 : rf[ra2];
endmodule
// Immediate extender. ImmSrc selects how Instr[23:0] becomes a 32-bit value:
//   00 : Instr[7:0]  zero-extended
//   01 : Instr[11:0] zero-extended
//   10 : Instr[23:0] sign-extended and shifted left by 2
//   11 : undefined (X)
module extend(input  logic [23:0] Instr,
              input  logic [1:0]  ImmSrc,
              output logic [31:0] ExtImm);

  logic [31:0] imm8_zext, imm12_zext, imm24_sext_x4;

  assign imm8_zext     = {{24{1'b0}}, Instr[7:0]};
  assign imm12_zext    = {{20{1'b0}}, Instr[11:0]};
  assign imm24_sext_x4 = {{6{Instr[23]}}, Instr, 2'b00};

  always_comb begin
    if      (ImmSrc == 2'b00) ExtImm = imm8_zext;
    else if (ImmSrc == 2'b01) ExtImm = imm12_zext;
    else if (ImmSrc == 2'b10) ExtImm = imm24_sext_x4;
    else                      ExtImm = 32'bx;   // undefined encoding
  end
endmodule
// Parameterized combinational adder: y = a + b, truncated to WIDTH bits
// (no carry out).
module adder #(parameter WIDTH = 8)
              (input  logic [WIDTH-1:0] a, b,
               output logic [WIDTH-1:0] y);

  always_comb
    y = a + b;
endmodule
// WIDTH-bit register with asynchronous active-high reset and clock enable.
// On reset q is cleared; otherwise q captures d on a rising clock edge only
// while en is high.
module flopenr #(parameter WIDTH = 8)
                (input  logic             clk, reset, en,
                 input  logic [WIDTH-1:0] d,
                 output logic [WIDTH-1:0] q);

  always_ff @(posedge clk or posedge reset) begin
    if (reset) begin
      q <= '0;
    end else if (en) begin
      q <= d;
    end
  end
endmodule
// WIDTH-bit register with asynchronous active-high reset.
// On reset q is cleared; otherwise q captures d on every rising clock edge.
module flopr #(parameter WIDTH = 8)
              (input  logic             clk, reset,
               input  logic [WIDTH-1:0] d,
               output logic [WIDTH-1:0] q);

  always_ff @(posedge clk or posedge reset) begin
    if (reset) begin
      q <= '0;
    end else begin
      q <= d;
    end
  end
endmodule
// Parameterized 2:1 multiplexer: y = d1 when s is 1, d0 when s is 0.
module mux2 #(parameter WIDTH = 8)
             (input  logic [WIDTH-1:0] d0, d1,
              input  logic             s,
              output logic [WIDTH-1:0] y);

  // the conditional operator is kept so that an unknown select merges the
  // two inputs bit by bit instead of arbitrarily picking one
  always_comb
    y = s ? d1 : d0;
endmodule
// 32-bit ALU.
//   ALUControl   operation
//   00           a + b
//   01           a - b   (computed as a + ~b + 1)
//   10           a & b
//   11           a | b
// ALUFlags = {neg, zero, carry, overflow}. carry and overflow are forced to 0
// for the two logical operations. An ALUControl containing X/Z bits produces
// an all-X Result instead of silently selecting one of the operations.
module alu(input  logic [31:0] a, b,
           input  logic [1:0]  ALUControl,
           output logic [31:0] Result,
           output logic [3:0]  ALUFlags);

  logic        neg, zero, carry, overflow;
  logic [31:0] condinvb;
  logic [32:0] sum;        // bit 32 holds the carry out of the adder

  // subtraction: invert b and inject a carry-in of 1 (two's complement)
  assign condinvb = ALUControl[0] ? ~b : b;
  assign sum      = a + condinvb + ALUControl[0];

  // Plain `case` rather than `casex`: casex would treat unknown selector bits
  // as wildcards and hide an undefined ALUControl behind a valid-looking
  // result. The default keeps the block fully specified.
  always_comb
    case (ALUControl)
      2'b00,
      2'b01:   Result = sum[31:0];
      2'b10:   Result = a & b;
      2'b11:   Result = a | b;
      default: Result = 32'bx;
    endcase

  assign neg      = Result[31];
  assign zero     = (Result == 32'b0);
  // carry and overflow are only meaningful for ADD / SUB
  assign carry    = (ALUControl[1] == 1'b0) & sum[32];
  // overflow: the operands (after the conditional inversion of b) have the
  // same sign, and the sum's sign differs from a's
  assign overflow = (ALUControl[1] == 1'b0) &
                    ~(a[31] ^ b[31] ^ ALUControl[0]) &
                    (a[31] ^ sum[31]);

  assign ALUFlags = {neg, zero, carry, overflow};
endmodule
|