diff --git a/lib/Transforms/Simplification/ReduceInitialInterval.cpp b/lib/Transforms/Simplification/ReduceInitialInterval.cpp
index c9658a3..2319a02 100644
--- a/lib/Transforms/Simplification/ReduceInitialInterval.cpp
+++ b/lib/Transforms/Simplification/ReduceInitialInterval.cpp
@@ -50,7 +50,6 @@ struct ReduceInitialIntervalPattern : public OpRewritePattern<AffineForOp> {
 
     // Traverse all buffer accesses in the loop body.
     for (auto pair : map) {
-      auto buf = pair.first;
       auto accesses = pair.second;
 
       // Only if a load depends on a dominated store (a back dependence), the
diff --git a/lib/Transforms/Utils.cpp b/lib/Transforms/Utils.cpp
index 2ced7a8..11b8eac 100644
--- a/lib/Transforms/Utils.cpp
+++ b/lib/Transforms/Utils.cpp
@@ -116,6 +116,7 @@ static void addPassPipeline(PassManager &pm) {
 
   // Generic common sub expression elimination.
   pm.addPass(createCSEPass());
+  pm.addPass(createReduceInitialIntervalPass());
 }
 
 bool scalehls::applyMemoryAccessOpt(FuncOp func) {
diff --git a/samples/rosetta/spam-filter/Sgd_sw_base.cpp b/samples/rosetta/spam-filter/SgdLR_sw_base.cpp
similarity index 100%
rename from samples/rosetta/spam-filter/Sgd_sw_base.cpp
rename to samples/rosetta/spam-filter/SgdLR_sw_base.cpp
diff --git a/samples/rosetta/spam-filter/Sgd_sw_dse.cpp b/samples/rosetta/spam-filter/SgdLR_sw_dse.cpp
similarity index 65%
rename from samples/rosetta/spam-filter/Sgd_sw_dse.cpp
rename to samples/rosetta/spam-filter/SgdLR_sw_dse.cpp
index 40e70c5..6f33c42 100644
--- a/samples/rosetta/spam-filter/Sgd_sw_dse.cpp
+++ b/samples/rosetta/spam-filter/SgdLR_sw_dse.cpp
@@ -17,13 +17,13 @@
 using namespace std;
 
 /// This is top function.
-/// Latency=190822514, interval=190822514
-/// DSP=68
+/// Latency=90810014, interval=90810014
+/// DSP=69
 void SgdLR_sw(
   float v0[4608000],
   int32_t v1[4500],
   float v2[1024]
-) {	// L1, [0,190822514)
+) {	// L1, [0,90810014)
   #pragma HLS interface s_axilite port=return bundle=ctrl
   #pragma HLS interface bram port=v0
   #pragma HLS interface bram port=v1
@@ -41,56 +41,56 @@ void SgdLR_sw(
   #pragma HLS array_partition variable=v3 cyclic factor=8 dim=1
   #pragma HLS resource variable=v3 core=ram_s2p_bram
 
-  for (int v4 = 0; v4 < 5; v4 += 1) {	// L6, [0,190822512), iterCycle=38164502, II=38164502
-    for (int v5 = 0; v5 < 4500; v5 += 1) {	// L7, [0,38164502), iterCycle=8481, II=8481
+  for (int v4 = 0; v4 < 5; v4 += 1) {	// L6, [0,90810012), iterCycle=18162002, II=18162002
+    for (int v5 = 0; v5 < 4500; v5 += 1) {	// L7, [0,18162002), iterCycle=4036, II=4036
       float v6[1];	// L8, [0,0)
-      v6[0] = 0.000000;	// L9, [8134,8135)
+      v6[0] = 0.000000;	// L9, [3724,3725)
       float v7[1];	// L10, [0,0)
-      v7[0] = 0.000000;	// L11, [8175,8176)
-      for (int v8 = 0; v8 < 128; v8 += 1) {	// L12, [0,8199), iterCycle=69, II=64
-        #pragma HLS pipeline II=42
-        float v9 = v6[0];	// L13, [5,6)
-        float v10 = v2[(v8 * 8)];	// L14, [0,2)
-        float v11 = v0[((v5 * 1024) + (v8 * 8))];	// L15, [0,2)
-        float v12 = v10 * v11;	// L16, [2,6)
-        float v13 = v9 + v12;	// L17, [6,11)
-        float v14 = v2[((v8 * 8) + 1)];	// L18, [5,7)
-        float v15 = v0[(((v5 * 1024) + (v8 * 8)) + 1)];	// L19, [5,7)
-        float v16 = v14 * v15;	// L20, [7,11)
-        float v17 = v13 + v16;	// L21, [11,16)
-        float v18 = v2[((v8 * 8) + 2)];	// L22, [10,12)
-        float v19 = v0[(((v5 * 1024) + (v8 * 8)) + 2)];	// L23, [10,12)
-        float v20 = v18 * v19;	// L24, [12,16)
-        float v21 = v17 + v20;	// L25, [16,21)
-        float v22 = v2[((v8 * 8) + 3)];	// L26, [15,17)
-        float v23 = v0[(((v5 * 1024) + (v8 * 8)) + 3)];	// L27, [15,17)
-        float v24 = v22 * v23;	// L28, [17,21)
-        float v25 = v21 + v24;	// L29, [21,26)
-        float v26 = v2[((v8 * 8) + 4)];	// L30, [20,22)
-        float v27 = v0[(((v5 * 1024) + (v8 * 8)) + 4)];	// L31, [20,22)
-        float v28 = v26 * v27;	// L32, [22,26)
-        float v29 = v25 + v28;	// L33, [26,31)
-        float v30 = v2[((v8 * 8) + 5)];	// L34, [25,27)
-        float v31 = v0[(((v5 * 1024) + (v8 * 8)) + 5)];	// L35, [25,27)
-        float v32 = v30 * v31;	// L36, [27,31)
-        float v33 = v29 + v32;	// L37, [31,36)
-        float v34 = v2[((v8 * 8) + 6)];	// L38, [30,32)
-        float v35 = v0[(((v5 * 1024) + (v8 * 8)) + 6)];	// L39, [30,32)
-        float v36 = v34 * v35;	// L40, [32,36)
-        float v37 = v33 + v36;	// L41, [36,41)
-        float v38 = v2[((v8 * 8) + 7)];	// L42, [35,37)
-        float v39 = v0[(((v5 * 1024) + (v8 * 8)) + 7)];	// L43, [35,37)
-        float v40 = v38 * v39;	// L44, [37,41)
-        float v41 = v37 + v40;	// L45, [41,46)
+      v7[0] = 0.000000;	// L11, [3730,3731)
+      for (int v8 = 0; v8 < 128; v8 += 1) {	// L12, [0,3754), iterCycle=69, II=29
+        #pragma HLS pipeline II=7
+        float v9 = v2[(v8 * 8)];	// L13, [0,2)
+        float v10 = v0[((v5 * 1024) + (v8 * 8))];	// L14, [0,2)
+        float v11 = v9 * v10;	// L15, [2,6)
+        float v12 = v2[((v8 * 8) + 1)];	// L16, [0,2)
+        float v13 = v0[(((v5 * 1024) + (v8 * 8)) + 1)];	// L17, [0,2)
+        float v14 = v12 * v13;	// L18, [2,6)
+        float v15 = v11 + v14;	// L19, [6,11)
+        float v16 = v2[((v8 * 8) + 2)];	// L20, [5,7)
+        float v17 = v0[(((v5 * 1024) + (v8 * 8)) + 2)];	// L21, [5,7)
+        float v18 = v16 * v17;	// L22, [7,11)
+        float v19 = v15 + v18;	// L23, [11,16)
+        float v20 = v2[((v8 * 8) + 3)];	// L24, [10,12)
+        float v21 = v0[(((v5 * 1024) + (v8 * 8)) + 3)];	// L25, [10,12)
+        float v22 = v20 * v21;	// L26, [12,16)
+        float v23 = v19 + v22;	// L27, [16,21)
+        float v24 = v2[((v8 * 8) + 4)];	// L28, [15,17)
+        float v25 = v0[(((v5 * 1024) + (v8 * 8)) + 4)];	// L29, [15,17)
+        float v26 = v24 * v25;	// L30, [17,21)
+        float v27 = v23 + v26;	// L31, [21,26)
+        float v28 = v2[((v8 * 8) + 5)];	// L32, [20,22)
+        float v29 = v0[(((v5 * 1024) + (v8 * 8)) + 5)];	// L33, [20,22)
+        float v30 = v28 * v29;	// L34, [22,26)
+        float v31 = v27 + v30;	// L35, [26,31)
+        float v32 = v2[((v8 * 8) + 6)];	// L36, [25,27)
+        float v33 = v0[(((v5 * 1024) + (v8 * 8)) + 6)];	// L37, [25,27)
+        float v34 = v32 * v33;	// L38, [27,31)
+        float v35 = v31 + v34;	// L39, [31,36)
+        float v36 = v2[((v8 * 8) + 7)];	// L40, [30,32)
+        float v37 = v0[(((v5 * 1024) + (v8 * 8)) + 7)];	// L41, [30,32)
+        float v38 = v36 * v37;	// L42, [32,36)
+        float v39 = v35 + v38;	// L43, [36,41)
+        float v40 = v6[0];	// L44, [40,41)
+        float v41 = v40 + v39;	// L45, [41,46)
         v6[0] = v41;	// L46, [68,69)
         v7[0] = v41;	// L47, [46,47)
       }
-      float v42 = v7[0];	// L49, [8177,8178)
-      float v43 = -(v42);	// L50, [8178,8178)
-      float v44 = exp(v43);	// L51, [8178,8178)
-      float v45 = 1.000000 + v44;	// L52, [8178,8183)
-      float v46 = 1.000000 / v45;	// L53, [8183,8199)
-      for (int v47 = 0; v47 < 128; v47 += 1) {	// L54, [8199,8340), iterCycle=12, II=1
+      float v42 = v7[0];	// L49, [3732,3733)
+      float v43 = -(v42);	// L50, [3733,3733)
+      float v44 = exp(v43);	// L51, [3733,3733)
+      float v45 = 1.000000 + v44;	// L52, [3733,3738)
+      float v46 = 1.000000 / v45;	// L53, [3738,3754)
+      for (int v47 = 0; v47 < 128; v47 += 1) {	// L54, [3754,3895), iterCycle=12, II=1
         #pragma HLS pipeline II=1
         int32_t v48 = v1[v5];	// L55, [0,2)
         float v49 = v48;	// L56, [2,2)
@@ -120,7 +120,7 @@ void SgdLR_sw(
         float v66 = v50 * v65;	// L80, [7,11)
         v3[((v47 * 8) + 7)] = v66;	// L81, [11,12)
       }
-      for (int v67 = 0; v67 < 128; v67 += 1) {	// L83, [8340,8481), iterCycle=12, II=1
+      for (int v67 = 0; v67 < 128; v67 += 1) {	// L83, [3895,4036), iterCycle=12, II=1
         #pragma HLS pipeline II=1
         float v68 = v3[(v67 * 8)];	// L84, [0,2)
         float v69 = -60000.000000 * v68;	// L85, [2,6)