Flush the instruction cache after loading JIT-compiled code.

x86 does not strictly require this because the hardware snoops writes to keep
the instruction cache coherent, but it is essential on ARM and some other
architectures.

Bug b/37478805

Change-Id: I9fad94571ec65b67132ba40c3e1814c63d6af468
Reviewed-on: https://swiftshader-review.googlesource.com/9429
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index b4a7db2..d1464a5 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -289,7 +289,7 @@
 		return symbolValue;
 	}
 
-	void *loadImage(uint8_t *const elfImage)
+	void *loadImage(uint8_t *const elfImage, size_t &codeSize)
 	{
 		ElfHeader *elfHeader = (ElfHeader*)elfImage;
 
@@ -312,6 +312,7 @@
 				if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
 				{
 					entry = elfImage + sectionHeader[i].sh_offset;
+					codeSize = sectionHeader[i].sh_size;
 				}
 			}
 			else if(sectionHeader[i].sh_type == SHT_REL)
@@ -429,7 +430,14 @@
 
 				position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
 
-				entry = loadImage(&buffer[0]);
+				size_t codeSize = 0;
+				entry = loadImage(&buffer[0], codeSize);
+
+				#if defined(_WIN32)
+					FlushInstructionCache(GetCurrentProcess(), NULL, 0);
+				#else
+					__builtin___clear_cache((char*)entry, (char*)entry + codeSize);
+				#endif
 			}
 
 			return entry;