merge issue-628 manually

Addresses #644.
phillipstanleymarbell · Mar 5, 2023 · 50853c4 · 50853c4
1 parent 37f066b
commit 50853c4
Show file tree

Hide file tree

Showing 6 changed files with 317 additions and 18 deletions.
diff --git a/analysis/statistics/77e9b64be9282e1325602ee373eb4eb470a4495f.txt b/analysis/statistics/77e9b64be9282e1325602ee373eb4eb470a4495f.txt
@@ -0,0 +1,46 @@
+
+changeset: 1435:77e9b64be9282e1325602ee373eb4eb470a4495f
+char kNewtonVersion[] = "0.3-alpha-1435 (77e9b64be9282e1325602ee373eb4eb470a4495f) (build 03-02-2023-19:[email protected]_64)";
+\n./src/noisy/noisy-linux-EN -O0 applications/noisy/helloWorld.n -s
+\n./src/newton/newton-linux-EN -v 0 -eP applications/newton/invariants/ViolinWithTemperatureDependence-pigroups.nt
+
+Informational Report:
+---------------------
+Invariant "ViolinWithTemperatureDependenceForPiGroups" has 2 unique kernels, each with 2 column(s)...
+
+	Kernel 0 is a valid kernel:
+
+		   1   1
+		-0.5  -0
+		   1   0
+		 0.5   0
+		   0  -1
+		  -0  -1
+
+
+		The ordering of parameters is:	P1 P0 P3 P2 P4 P5 
+
+			Pi group 0, Pi 0 is:	P0^(-0.5)  P1^( 1)  P2^(0.5)  P3^( 1)  P4^( 0)  P5^(-0)  
+
+			Pi group 0, Pi 1 is:	P0^(-0)  P1^( 1)  P2^( 0)  P3^( 0)  P4^(-1)  P5^(-1)  
+
+
+	Kernel 1 is a valid kernel:
+
+		   1   0
+		-0.5   1
+		   1  -2
+		 0.5  -1
+		  -0  -2
+		   0  -2
+
+
+		The ordering of parameters is:	P1 P0 P3 P2 P4 P5 
+
+			Pi group 1, Pi 0 is:	P0^(-0.5)  P1^( 1)  P2^(0.5)  P3^( 1)  P4^(-0)  P5^( 0)  
+
+			Pi group 1, Pi 1 is:	P0^( 1)  P1^( 0)  P2^(-1)  P3^(-2)  P4^(-2)  P5^(-2)  
+
+
+
+
diff --git a/applications/newton/llvm-ir/Makefile b/applications/newton/llvm-ir/Makefile
@@ -18,7 +18,7 @@ endif
 
 all: default
 
-default: application.ll simple_control_flow.ll inferBound.ll inferBoundControlFlow.ll e_exp.ll sincosf.ll e_log.ll e_acosh.ll e_j0.ll e_y0.ll e_rem_pio2.ll benchmark_suite.ll phi_two_global_arrays.ll func_call.ll test_shift.ll
+default: application.ll simple_control_flow.ll inferBound.ll inferBoundControlFlow.ll e_exp.ll sincosf.ll e_log.ll e_acosh.ll e_j0.ll e_y0.ll e_rem_pio2.ll benchmark_suite.ll phi_two_global_arrays.ll func_call.ll test_shift.ll vec_add.ll vec_add_8.ll
 
 %.ll : %.c
 	@echo Compiling $*.c

diff --git a/applications/newton/llvm-ir/c-files/vec_add.c b/applications/newton/llvm-ir/c-files/vec_add.c
@@ -0,0 +1,80 @@
+/*
+ * compile with 'clang --target=aarch64-arm-none-eabi -O1 vec_add.c -o vec_add -fvectorize'
+ * */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+
+typedef struct timespec timespec;
+/* Returns end - start, borrowing one second when the nanosecond difference is negative. */
+timespec diff(timespec start, timespec end)
+{
+    timespec elapsed;
+    elapsed.tv_sec  = end.tv_sec - start.tv_sec;
+    elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
+    if (elapsed.tv_nsec < 0) {
+        /* Move one second into the nanosecond field to cover the deficit. */
+        elapsed.tv_sec  -= 1;
+        elapsed.tv_nsec += 1000000000;
+    }
+    return elapsed;
+}
+
+/* Returns t1 + t2, carrying one second out of the nanosecond field when it reaches 1e9. */
+timespec sum(timespec t1, timespec t2) {
+    timespec total;
+    total.tv_sec  = t1.tv_sec + t2.tv_sec;
+    total.tv_nsec = t1.tv_nsec + t2.tv_nsec;
+    if (total.tv_nsec >= 1000000000) {
+        total.tv_sec  += 1;
+        total.tv_nsec -= 1000000000;
+    }
+    return total;
+}
+
+/* Writes "<prefix>: <seconds>.<nanoseconds>" with the nanoseconds zero-padded to nine digits. */
+void printTimeSpec(timespec t, const char* prefix) {
+    /* Both fields are narrowed to int to match the %d conversions. */
+    const int seconds     = (int)t.tv_sec;
+    const int nanoseconds = (int)t.tv_nsec;
+    printf("%s: %d.%09d\n", prefix, seconds, nanoseconds);
+}
+
+/* Returns the current CLOCK_REALTIME reading; pass its address to toc() to print the elapsed time. */
+timespec tic( )
+{
+    timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    return now;
+}
+
+/*
+ * Prints, labelled with prefix, the time elapsed since *start_time, and
+ * overwrites *start_time with the current CLOCK_REALTIME reading so that
+ * back-to-back calls time consecutive intervals.
+ */
+void toc( timespec* start_time, const char* prefix )
+{
+    timespec now;
+    timespec elapsed;
+    clock_gettime(CLOCK_REALTIME, &now);
+    elapsed = diff( *start_time, now );
+    *start_time = now;
+    printTimeSpec( elapsed, prefix );
+}
+
+typedef int32_t bmx055fAcceleration;
+
+#define NUM 102400
+
+void vec_add(bmx055fAcceleration *vec_A, bmx055fAcceleration *vec_B, bmx055fAcceleration *vec_C, int len_vec) { /* element-wise add over the first len_vec elements: vec_C[i] = vec_A[i] + vec_B[i] */
+    int i;
+    for (i=0; i<len_vec; i++) { /* plain counted loop; the header comment of this file builds it with -fvectorize */
+        vec_C[i] = vec_A[i] + vec_B[i]; /* bmx055fAcceleration is a typedef of int32_t */
+    }
+}
+
+/*
+ * Fills x and y with pseudo-random values in [0, INT8_MAX), times a single
+ * vec_add call over NUM elements, then prints every element of the result.
+ * Always returns 0.
+ */
+int main() {
+    int32_t x[NUM], y[NUM], z[NUM];
+    for (size_t idx = 0; idx < NUM; idx++) {
+        x[idx] = rand() % INT8_MAX;
+        y[idx] = rand() % INT8_MAX;
+    }
+    timespec timer = tic();
+    vec_add(x, y, z, NUM);
+    toc(&timer, "computation delay");
+    for (size_t idx = 0; idx < NUM; idx++) {
+        /* idx is a size_t, so it must be printed with %zu; passing it to %d is undefined behaviour. */
+        printf("value of z[%zu]=%d, ", idx, z[idx]);
+    }
+    return 0;
+}
diff --git a/applications/newton/llvm-ir/c-files/vec_add_8.c b/applications/newton/llvm-ir/c-files/vec_add_8.c
@@ -0,0 +1,78 @@
+/*
+ * compile with 'clang --target=aarch64-arm-none-eabi -O1 vec_add_8.c -o vec_add_8 -fvectorize'
+ * */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+
+typedef struct timespec timespec;
+/* Returns end - start, borrowing one second when the nanosecond difference is negative. */
+timespec diff(timespec start, timespec end)
+{
+    timespec elapsed;
+    elapsed.tv_sec  = end.tv_sec - start.tv_sec;
+    elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
+    if (elapsed.tv_nsec < 0) {
+        /* Move one second into the nanosecond field to cover the deficit. */
+        elapsed.tv_sec  -= 1;
+        elapsed.tv_nsec += 1000000000;
+    }
+    return elapsed;
+}
+
+/* Returns t1 + t2, carrying one second out of the nanosecond field when it reaches 1e9. */
+timespec sum(timespec t1, timespec t2) {
+    timespec total;
+    total.tv_sec  = t1.tv_sec + t2.tv_sec;
+    total.tv_nsec = t1.tv_nsec + t2.tv_nsec;
+    if (total.tv_nsec >= 1000000000) {
+        total.tv_sec  += 1;
+        total.tv_nsec -= 1000000000;
+    }
+    return total;
+}
+
+/* Writes "<prefix>: <seconds>.<nanoseconds>" with the nanoseconds zero-padded to nine digits. */
+void printTimeSpec(timespec t, const char* prefix) {
+    /* Both fields are narrowed to int to match the %d conversions. */
+    const int seconds     = (int)t.tv_sec;
+    const int nanoseconds = (int)t.tv_nsec;
+    printf("%s: %d.%09d\n", prefix, seconds, nanoseconds);
+}
+
+/* Returns the current CLOCK_REALTIME reading; pass its address to toc() to print the elapsed time. */
+timespec tic( )
+{
+    timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    return now;
+}
+
+/*
+ * Prints, labelled with prefix, the time elapsed since *start_time, and
+ * overwrites *start_time with the current CLOCK_REALTIME reading so that
+ * back-to-back calls time consecutive intervals.
+ */
+void toc( timespec* start_time, const char* prefix )
+{
+    timespec now;
+    timespec elapsed;
+    clock_gettime(CLOCK_REALTIME, &now);
+    elapsed = diff( *start_time, now );
+    *start_time = now;
+    printTimeSpec( elapsed, prefix );
+}
+
+#define NUM 102400
+
+void vec_add(int8_t *vec_A, int8_t *vec_B, int8_t *vec_C, int len_vec) { /* element-wise add over the first len_vec elements: vec_C[i] = vec_A[i] + vec_B[i] */
+    int i;
+    for (i=0; i<len_vec; i++) { /* plain counted loop; the header comment of this file builds it with -fvectorize */
+        vec_C[i] = vec_A[i] + vec_B[i]; /* operands promote to int; the sum is converted back to int8_t on store */
+    }
+}
+
+/*
+ * Fills x and y with pseudo-random values in [0, INT8_MAX), times a single
+ * vec_add call over NUM int8_t elements, then prints every element of the
+ * result. Always returns 0.
+ */
+int main() {
+    int8_t x[NUM], y[NUM], z[NUM];
+    for (size_t idx = 0; idx < NUM; idx++) {
+        x[idx] = rand() % INT8_MAX;
+        y[idx] = rand() % INT8_MAX;
+    }
+    timespec timer = tic();
+    vec_add(x, y, z, NUM);
+    toc(&timer, "computation delay");
+    for (size_t idx = 0; idx < NUM; idx++) {
+        /* idx is a size_t, so it must be printed with %zu; passing it to %d is undefined behaviour. */
+        /* z[idx] is an int8_t that promotes to int in the variadic call, so %d is correct for it. */
+        printf("value of z[%zu]=%d, ", idx, z[idx]);
+    }
+    return 0;
+}
diff --git a/applications/newton/llvm-ir/c-files/vectorize_experiment.md b/applications/newton/llvm-ir/c-files/vectorize_experiment.md
@@ -0,0 +1,69 @@
+# Experiment Results of vectorization
+
+## Compile only with `Clang`
+### x86-64
+```bash
+clang -O1 vec_add.c -o vec_add # 0.000209616 s
+clang -O1 vec_add.c -o vec_add -fvectorize # 0.000157489 s
+clang -O1 vec_add_8.c -o vec_add_8 # 0.000111221 s
+clang -O1 vec_add_8.c -o vec_add_8 -fvectorize # 0.000048906 s
+```
+
+### arm64
+```bash
+clang --target=aarch64-arm-none-eabi -O1 vec_add.c -o vec_add # 0.001143304 s
+clang --target=aarch64-arm-none-eabi -O1 vec_add.c -o vec_add -fvectorize # 0.000856311 s
+clang --target=aarch64-arm-none-eabi -O1 vec_add_8.c -o vec_add_8 # 0.000776979 s
+clang --target=aarch64-arm-none-eabi -O1 vec_add_8.c -o vec_add_8 -fvectorize # 0.000201536 s
+```
+
+## Compile with `Clang` and `opt`
+### x86-64
+```bash
+clang -O0 -g -Xclang -disable-O0-optnone vec_add.c -S -emit-llvm -o vec_add.ll
+opt vec_add.ll --O1 -S -o vec_add_none_opt.ll
+clang vec_add_none_opt.ll -o vec_add_none_opt
+./vec_add_none_opt # 0.000328377 s
+opt vec_add.ll --O1 --loop-vectorize -S -o vec_add_opt.ll
+clang vec_add_opt.ll -o vec_add_opt
+./vec_add_opt # 0.000319101 s
+clang -O0 -g -Xclang -disable-O0-optnone vec_add_8.c -S -emit-llvm -o vec_add_8.ll
+opt vec_add_8.ll --O1 -S -o vec_add_8_none_opt.ll
+clang vec_add_8_none_opt.ll -o vec_add_8_none_opt
+./vec_add_8_none_opt # 0.000207441 s
+opt vec_add_8.ll --O1 --loop-vectorize -S -o vec_add_8_opt.ll
+clang vec_add_8_opt.ll -o vec_add_8_opt
+./vec_add_8_opt # 0.000206795 s
+```
+
+### arm64
+```bash
+clang --target=aarch64-arm-none-eabi -O0 -g -Xclang -disable-O0-optnone vec_add.c -S -emit-llvm -o vec_add.ll
+opt vec_add.ll --O1 -S -o vec_add_none_opt.ll
+clang --target=aarch64-arm-none-eabi vec_add_none_opt.ll -o vec_add_none_opt
+./vec_add_none_opt # 0.002345815 s
+opt vec_add.ll --O1 --loop-vectorize -S -o vec_add_opt.ll
+clang --target=aarch64-arm-none-eabi vec_add_opt.ll -o vec_add_opt
+./vec_add_opt # 0.000947018 s
+clang --target=aarch64-arm-none-eabi -O0 -g -Xclang -disable-O0-optnone vec_add_8.c -S -emit-llvm -o vec_add_8.ll
+opt vec_add_8.ll --O1 -S -o vec_add_8_none_opt.ll
+clang --target=aarch64-arm-none-eabi vec_add_8_none_opt.ll -o vec_add_8_none_opt
+./vec_add_8_none_opt # 0.002099071 s
+opt vec_add_8.ll --O1 --loop-vectorize -S -o vec_add_8_opt.ll
+clang --target=aarch64-arm-none-eabi vec_add_8_opt.ll -o vec_add_8_opt
+./vec_add_8_opt # 0.000227494 s
+```
+
+## Run with Newton Compiler
+```bash
+cd ../../../../src/newton
+./newton-linux-EN --llvm-ir=../../applications/newton/llvm-ir/vec_add.ll --llvm-ir-liveness-check ../../applications/newton/sensors/test.nt
+cd -
+llvm-dis ../vec_add_output.bc
+opt ../vec_add.ll --O1 --loop-vectorize -S -o vec_add_opt.ll
+clang vec_add_opt.ll -o vec_add_opt
+./vec_add_opt # 0.000318110 s
+opt ../vec_add_output.ll --O1 --loop-vectorize -S -o vec_add_output_opt.ll
+clang vec_add_output_opt.ll -o vec_add_output_opt
+./vec_add_output_opt # 0.000205080 s
+```
diff --git a/src/newton/newton-irPass-LLVMIR-shrinkTypeByRange.cpp b/src/newton/newton-irPass-LLVMIR-shrinkTypeByRange.cpp
@@ -92,11 +92,13 @@ varType
 getFloatingTypeEnum(double min, double max)
 {
 	varType finalType;
-	if ((std::abs(min) < FLT_MAX) && (std::abs(max) < FLT_MAX))
+    if ((FLT_EPSILON < std::abs(min) && std::abs(min) < FLT_MAX) &&
+        (FLT_EPSILON < std::abs(max) && std::abs(max) < FLT_MAX))
 	{
 		finalType = FLOAT;
 	}
-	else if ((std::abs(min) < DBL_MAX) && (std::abs(max) < DBL_MAX))
+    else if ((DBL_EPSILON < std::abs(min) && std::abs(min) < DBL_MAX) &&
+             (DBL_EPSILON < std::abs(max) && std::abs(max) < DBL_MAX))
 	{
 		finalType = DOUBLE;
 	}
@@ -925,6 +927,11 @@ matchDestType(State * N, Instruction * inInstruction, BasicBlock & llvmIrBasicBl
 			typeInfo backType;
 			backType.signFlag  = isSignedValue(inInstruction);
 			backType.valueType = inInstType;
+            if (isa<LoadInst>(inInstruction))
+            {
+                unsigned ptAddressSpace = srcType->getPointerAddressSpace();
+                backType.valueType	= backType.valueType->getPointerTo(ptAddressSpace);
+            }
 			for (size_t id = 0; id < inInstruction->getNumOperands(); id++)
 			{
 				auto newTypeValue = rollbackType(N, inInstruction, id, llvmIrBasicBlock, typeChangedInst, backType);
@@ -974,7 +981,13 @@ matchDestType(State * N, Instruction * inInstruction, BasicBlock & llvmIrBasicBl
 		/*
 		 * roll back operands to typeInformation.valueType
 		 * */
-		for (size_t id = 0; id < inInstruction->getNumOperands(); id++)
+        if (isa<LoadInst>(inInstruction))
+        {
+            unsigned ptAddressSpace	  = srcType->getPointerAddressSpace();
+            typeInformation.valueType = typeInformation.valueType->getPointerTo(ptAddressSpace);
+        }
+        size_t roll_backed_op_num = isa<GetElementPtrInst>(inInstruction) ? 1 : inInstruction->getNumOperands();
+        for (size_t id = 0; id < roll_backed_op_num; id++)
 		{
 			typeInfo operandPrevTypeInfo{typeInformation.valueType,
 						     isSignedValue(inInstruction->getOperand(id))};
@@ -1496,6 +1509,10 @@ mergeCast(State * N, Function & llvmIrFunction,
 			Instruction * llvmIrInstruction = &*itBB++;
 			switch (llvmIrInstruction->getOpcode())
 			{
+                case Instruction::FPToUI:
+                case Instruction::FPToSI:
+                case Instruction::SIToFP:
+                case Instruction::UIToFP:
 				case Instruction::ZExt:
 				case Instruction::SExt:
 				case Instruction::FPExt:
@@ -1540,7 +1557,23 @@ mergeCast(State * N, Function & llvmIrFunction,
 								 * */
 								Value * castInst;
 								auto	valueType = llvmIrInstruction->getType();
-								if (valueType->isIntegerTy())
+                                if ((valueType->isFloatTy() || valueType->isDoubleTy()) &&
+                                    sourceOperand->getType()->isIntegerTy())
+                                {
+                                    // float fa = (float)ia;
+                                    bool isSigned = sourceInst->getOpcode() == Instruction::SIToFP;
+                                    castInst      = isSigned ? Builder.CreateSIToFP(sourceOperand, valueType)
+                                                             : Builder.CreateUIToFP(sourceOperand, valueType);
+                                }
+                                else if (valueType->isIntegerTy() &&
+                                         (sourceOperand->getType()->isFloatTy() || sourceOperand->getType()->isDoubleTy()))
+                                {
+                                    // int iq = (int)fq;
+                                    bool isSigned = sourceInst->getOpcode() == Instruction::FPToSI;
+                                    castInst      = isSigned ? Builder.CreateFPToSI(sourceOperand, valueType)
+                                                             : Builder.CreateFPToUI(sourceOperand, valueType);
+                                }
+                                else if (valueType->isIntegerTy())
 								{
 									castInst = Builder.CreateIntCast(sourceOperand, valueType,
 													 llvmIrInstruction->getOpcode() == Instruction::SExt);
@@ -1648,6 +1681,10 @@ countCastInst(State * N, Function & llvmIrFunction)
 		{
 			switch (llvmIrInstruction.getOpcode())
 			{
+                case Instruction::FPToUI:
+                case Instruction::FPToSI:
+                case Instruction::SIToFP:
+                case Instruction::UIToFP:
 				case Instruction::ZExt:
 				case Instruction::SExt:
 				case Instruction::FPExt:
@@ -1827,19 +1864,8 @@ shrinkType(State * N, BoundInfo * boundInfo, Function & llvmIrFunction)
 	 * 1. construct instruction dependency link
 	 * 2. work with roll back strategies
 	 * */
-	std::vector<std::vector<Value *>> prevDepLink	  = getDependencyLink(N, llvmIrFunction);
-	std::map<Value *, typeInfo>	  typeChangedInst = shrinkInstType(N, boundInfo, llvmIrFunction);
-	mergeCast(N, llvmIrFunction, boundInfo->virtualRegisterRange, typeChangedInst);
-	std::vector<std::vector<Value *>> newDepLink = getDependencyLink(N, llvmIrFunction);
-
-	for (auto & depLink : newDepLink)
-	{
-		if (rollBackStrategy(N, depLink))
-		{
-			rollBackDependencyLink(N, depLink, boundInfo->virtualRegisterRange, typeChangedInst);
-		}
-	}
+    std::map<Value *, typeInfo> typeChangedInst = shrinkInstType(N, boundInfo, llvmIrFunction);
 
 	mergeCast(N, llvmIrFunction, boundInfo->virtualRegisterRange, typeChangedInst);
 }
-}
+}