openjdk-1.8.0
/
Ddot-intrinsic-implement.patch

diff --git a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
index 1e9b1cb91..c0fd37d05 100644
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
@@ -2061,6 +2061,14 @@ public:
     ld_st(Vt, T, a, op1, op2);						\
   }

+  void ld1_d(FloatRegister Vt, int index, const Address &a) {
+    starti;
+    assert(index == 0 || index == 1, "Index must be 0 or 1 for Vx.2D");
+    f(0, 31), f(index & 1, 30);
+    f(0b001101110, 29, 21), rf(a.index(), 16), f(0b1000, 15, 12);
+    f(0b01, 11, 10), rf(a.base(), 5), rf(Vt, 0);
+  }
+
   INSN1(ld1,  0b001100010, 0b0111);
   INSN2(ld1,  0b001100010, 0b1010);
   INSN3(ld1,  0b001100010, 0b0110);
@@ -2186,6 +2194,13 @@ public:

 #undef INSN

+  void faddp_d(FloatRegister Vd, FloatRegister Vn) {
+    starti;
+    f(0b01, 31, 30), f(0b1111100, 29, 23), f(0b1, 22), f(0b11000, 21, 17);
+    f(0b0110110, 16, 10);
+    rf(Vn, 5), rf(Vd, 0);
+  }
+
 #define INSN(NAME, opc)                                                                 \
   void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
     starti;                                                                             \
diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
index f2f85df60..873da580b 100644
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
@@ -2853,6 +2853,124 @@ void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
   eor(crc, crc, tmp);
 }

+/**
+ * Multiply and summation of 1 double-precision floating number pairs(sparse)
+ */
+void MacroAssembler::f2j_ddot_s1(Register dx, Register incx,
+                                 Register dy, Register incy) {
+  const FloatRegister tmpx = v2;
+  const FloatRegister tmpy = v3;
+
+  ld1_d(tmpx, 0, Address(dx, incx));
+  ld1_d(tmpy, 0, Address(dy, incy));
+  fmaddd(v0, tmpx, tmpy, v0);
+}
+
+/**
+ * Multiply and summation of 1 double-precision floating number pairs(dense)
+ */
+void MacroAssembler::f2j_ddot_d1(Register dx, Register dy, int size) {
+  const FloatRegister tmpx = v2;
+  const FloatRegister tmpy = v3;
+
+  ldrd(tmpx, post(dx, size));
+  ldrd(tmpy, post(dy, size));
+  fmaddd(v0, tmpx, tmpy, v0);
+}
+
+/**
+ * Multiply and summation of 4 double-precision floating numbers
+ */
+void MacroAssembler::f2j_ddot_d4(Register dx, Register dy) {
+  ld1(v2, v3, T2D, post(dx, 32));
+  ld1(v4, v5, T2D, post(dy, 32));
+  fmul(v2, T2D, v2, v4);
+  fmul(v3, T2D, v3, v5);
+  fadd(v0, T2D, v0, v2);
+  fadd(v6, T2D, v6, v3);
+}
+
+/**
+ * @param n         register containing the number of doubles in array
+ * @param dx        register pointing to input array
+ * @param incx      register containing step len for dx
+ * @param dy        register pointing to another input array
+ * @param incy      register containing step len for dy
+ * @param temp_reg  register containing loop variable
+ */
+void MacroAssembler::f2j_ddot(Register n, Register dx, Register incx,
+                              Register dy, Register incy, Register temp_reg) {
+  Label Ldot_EXIT, Ldot_S_BEGIN, Ldot_S1, Ldot_S10, Ldot_S4, Ldot_D_BEGIN,
+        Ldot_D1, Ldot_D10, Ldot_D4;
+
+  const int SZ = 8;
+
+    enter();
+    fmovd(v0, zr);
+    fmovd(v6, v0);
+
+    cmp(n, zr);
+    br(Assembler::LE, Ldot_EXIT);
+
+    cmp(incx, 1);
+    br(Assembler::NE, Ldot_S_BEGIN);
+    cmp(incy, 1);
+    br(Assembler::NE, Ldot_S_BEGIN);
+
+  BIND(Ldot_D_BEGIN);
+    asr(temp_reg, n, 2);
+    cmp(temp_reg, zr);
+    br(Assembler::LE, Ldot_D1);
+
+  BIND(Ldot_D4);
+    f2j_ddot_d4(dx, dy);
+    subs(temp_reg, temp_reg, 1);
+    br(Assembler::NE, Ldot_D4);
+
+    fadd(v0, T2D, v0, v6);
+    faddp_d(v0, v0);
+
+  BIND(Ldot_D1);
+    ands(temp_reg, n, 3);
+    br(Assembler::LE, Ldot_EXIT);
+
+  BIND(Ldot_D10);
+    f2j_ddot_d1(dx, dy, SZ);
+    subs(temp_reg, temp_reg, 1);
+    br(Assembler::NE, Ldot_D10);
+    leave();
+    ret(lr);
+
+  BIND(Ldot_S_BEGIN);
+    lsl(incx, incx, 3);
+    lsl(incy, incy, 3);
+
+    asr(temp_reg, n, 2);
+    cmp(temp_reg, zr);
+    br(Assembler::LE, Ldot_S1);
+
+  BIND(Ldot_S4);
+    f2j_ddot_s1(dx, incx, dy, incy);
+    f2j_ddot_s1(dx, incx, dy, incy);
+    f2j_ddot_s1(dx, incx, dy, incy);
+    f2j_ddot_s1(dx, incx, dy, incy);
+    subs(temp_reg, temp_reg, 1);
+    br(Assembler::NE, Ldot_S4);
+
+  BIND(Ldot_S1);
+    ands(temp_reg, n, 3);
+    br(Assembler::LE, Ldot_EXIT);
+
+  BIND(Ldot_S10);
+    f2j_ddot_s1(dx, incx, dy, incy);
+    subs(temp_reg, temp_reg, 1);
+    br(Assembler::NE, Ldot_S10);
+
+  BIND(Ldot_EXIT);
+    leave();
+    ret(lr);
+}
+
 /**
  * @param crc   register containing existing CRC (32-bit)
  * @param buf   register pointing to input byte buffer (byte*)
diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
index 388177589..1abc7e3b0 100644
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
@@ -1180,6 +1180,9 @@ public:
         Register table0, Register table1, Register table2, Register table3,
         bool upper = false);

+  void f2j_ddot(Register n, Register dx, Register incx,
+                  Register dy, Register incy, Register temp_reg);
+
   void string_compare(Register str1, Register str2,
 		      Register cnt1, Register cnt2, Register result,
 		      Register tmp1);
@@ -1236,6 +1239,11 @@ private:
   // Uses rscratch2 if the address is not directly reachable
   Address spill_address(int size, int offset, Register tmp=rscratch2);

+private:
+  void f2j_ddot_s1(Register dx, Register incx, Register dy, Register incy);
+  void f2j_ddot_d1(Register dx, Register dy, int size);
+  void f2j_ddot_d4(Register dx, Register dy);
+
 public:
   void spill(Register Rx, bool is64, int offset) {
     if (is64) {
diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
index 0d73c0c0c..337d5c1dd 100644
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
@@ -45,6 +45,7 @@

 #include "stubRoutines_aarch64.hpp"

+
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
@@ -3220,6 +3221,39 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   c_rarg0   - int n
+   *   c_rarg1   - double[] dx
+   *   c_rarg2   - int incx
+   *   c_rarg3   - double[] dy
+   *   c_rarg4   - int incy
+   *
+   * Output:
+   *       d0   - ddot result
+   *
+   */
+  address generate_ddotF2jBLAS() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "f2jblas_ddot");
+
+    address start = __ pc();
+
+    const Register n    = c_rarg0;
+    const Register dx   = c_rarg1;
+    const Register incx = c_rarg2;
+    const Register dy   = c_rarg3;
+    const Register incy = c_rarg4;
+
+    BLOCK_COMMENT("Entry:");
+
+    __ f2j_ddot(n, dx, incx, dy, incy, rscratch2);
+
+    return start;
+  }
+
   /**
    *  Arguments:
    *
@@ -4262,6 +4296,10 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_montgomerySquare = g.generate_multiply();
     }

+    if (UseF2jBLASIntrinsics) {
+      StubRoutines::_ddotF2jBLAS = generate_ddotF2jBLAS();
+    }
+
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp
index 148f9212e..6bd8dbedd 100644
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@@ -852,6 +852,12 @@
    do_name(     implCompress_name,                                 "implCompress0")                                     \
    do_signature(implCompress_signature,                            "([BI)V")                                            \
                                                                                                                         \
+  /* support for com.github.fommil.netlib.F2jBLAS */                                                                    \
+  do_class(com_github_fommil_netlib_f2jblas,                       "com/github/fommil/netlib/F2jBLAS")                  \
+  do_intrinsic(_f2jblas_ddot, com_github_fommil_netlib_f2jblas, ddot_name, ddot_signature, F_R)                         \
+   do_name(     ddot_name,                                         "ddot")                                              \
+   do_signature(ddot_signature,                                    "(I[DI[DI)D")                                        \
+                                                                                                                        \
   /* support for sun.security.provider.SHA2 */                                                                          \
   do_class(sun_security_provider_sha2,                             "sun/security/provider/SHA2")                        \
   do_intrinsic(_sha2_implCompress, sun_security_provider_sha2, implCompress_name, implCompress_signature, F_R)          \
diff --git a/hotspot/src/share/vm/oops/method.cpp b/hotspot/src/share/vm/oops/method.cpp
index 24fae4d30..64cdae9c7 100644
--- a/hotspot/src/share/vm/oops/method.cpp
+++ b/hotspot/src/share/vm/oops/method.cpp
@@ -1281,7 +1281,9 @@ vmSymbols::SID Method::klass_id_for_intrinsics(Klass* holder) {
   // which does not use the class default class loader so we check for its loader here
   InstanceKlass* ik = InstanceKlass::cast(holder);
   if ((ik->class_loader() != NULL) && !SystemDictionary::is_ext_class_loader(ik->class_loader())) {
-    return vmSymbols::NO_SID;   // regardless of name, no intrinsics here
+    if (!EnableIntrinsicExternal) {
+      return vmSymbols::NO_SID;   // regardless of name, no intrinsics here
+    }
   }

   // see if the klass name is well-known:
diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp
index 9ef1c5e69..aa1b1ac3a 100644
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@@ -978,7 +978,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "f2jblas_ddot") == 0)
                  ))) {
             call->dump();
             fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp
index 89ebabe6f..5cbc0f012 100644
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@@ -335,6 +335,7 @@ class LibraryCallKit : public GraphKit {
   bool inline_mulAdd();
   bool inline_montgomeryMultiply();
   bool inline_montgomerySquare();
+  bool inline_ddotF2jBLAS();

   bool inline_profileBoolean();
 };
@@ -587,6 +588,10 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
     if (!UseCRC32Intrinsics) return NULL;
     break;

+  case vmIntrinsics::_f2jblas_ddot:
+    if (!UseF2jBLASIntrinsics) return NULL;
+    break;
+
   case vmIntrinsics::_incrementExactI:
   case vmIntrinsics::_addExactI:
     if (!Matcher::match_rule_supported(Op_OverflowAddI) || !UseMathExactIntrinsics) return NULL;
@@ -983,6 +988,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {

   case vmIntrinsics::_profileBoolean:
     return inline_profileBoolean();
+  case vmIntrinsics::_f2jblas_ddot:
+    return inline_ddotF2jBLAS();

   default:
     // If you get here, it may be that someone has added a new intrinsic
@@ -6303,6 +6310,49 @@ bool LibraryCallKit::inline_updateBytesCRC32() {
   return true;
 }

+/**
+ * double com.github.fommil.netlib.F2jBLAS.ddot(int n, double[] dx, int incx, double[] dy, int incy)
+ */
+bool LibraryCallKit::inline_ddotF2jBLAS() {
+  assert(callee()->signature()->size() == 5, "update has 5 parameters");
+  Node* n    = argument(1);       // type: int
+  Node* dx   = argument(2);       // type: double[]
+  Node* incx = argument(3);       // type: int
+  Node* dy   = argument(4);       // type: double[]
+  Node* incy = argument(5);       // type: int
+
+  const Type* dx_type = dx->Value(&_gvn);
+  const Type* dy_type = dy->Value(&_gvn);
+  const TypeAryPtr* dx_top_src = dx_type->isa_aryptr();
+  const TypeAryPtr* dy_top_src = dy_type->isa_aryptr();
+  if (dx_top_src == NULL || dx_top_src->klass() == NULL ||
+      dy_top_src == NULL || dy_top_src->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  // Figure out the size and type of the elements we will be copying.
+  BasicType dx_elem = dx_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType dy_elem = dy_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (dx_elem != T_DOUBLE || dy_elem != T_DOUBLE) {
+    return false;
+  }
+
+  // 'dx_start' points to dx array + scaled offset
+  Node* dx_start = array_element_address(dx, intcon(0), dx_elem);
+  Node* dy_start = array_element_address(dy, intcon(0), dy_elem);
+
+  address stubAddr = StubRoutines::ddotF2jBLAS();
+  const char *stubName = "f2jblas_ddot";
+  Node* call;
+  call = make_runtime_call(RC_LEAF, OptoRuntime::ddotF2jBLAS_Type(),
+                           stubAddr, stubName, TypePtr::BOTTOM,
+                           n, dx_start, incx, dy_start, incy);
+  Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
+
 /**
  * Calculate CRC32 for ByteBuffer.
  * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp
index ba8f42e49..f1fe4d666 100644
--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@@ -920,6 +920,30 @@ const TypeFunc* OptoRuntime::updateBytesCRC32_Type() {
   return TypeFunc::make(domain, range);
 }

+/**
+ * double ddot(int n, double *dx, int incx, double *dy, int incy)
+ */
+const TypeFunc* OptoRuntime::ddotF2jBLAS_Type() {
+  // create input type (domain)
+  int num_args = 5;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypeInt::INT;        // n
+  fields[argp++] = TypeAryPtr::DOUBLES;    // dx
+  fields[argp++] = TypeInt::INT;        // incx
+  fields[argp++] = TypeAryPtr::DOUBLES;    // dy
+  fields[argp++] = TypeInt::INT;        // incy
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = Type::DOUBLE;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 // for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
 const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
   // create input type (domain)
diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp
index e3bdfdf9c..66d393c5c 100644
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@@ -317,6 +317,8 @@ private:

   static const TypeFunc* updateBytesCRC32_Type();

+  static const TypeFunc* ddotF2jBLAS_Type();
+
   // leaf on stack replacement interpreter accessor types
   static const TypeFunc* osr_end_Type();

diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp
index 7b17e623b..520cc3187 100644
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@@ -743,6 +743,12 @@ class CommandLineFlags {
   product(bool, UseCRC32Intrinsics, false,                                  \
           "use intrinsics for java.util.zip.CRC32")                         \
                                                                             \
+  experimental(bool, UseF2jBLASIntrinsics, false,                           \
+          "use intrinsics for com.github.fommil.netlib.F2jBLAS on aarch64") \
+                                                                            \
+  experimental(bool, EnableIntrinsicExternal, false,                        \
+          "enable intrinsics for methods of external packages")             \
+                                                                            \
   develop(bool, TraceCallFixup, false,                                      \
           "Trace all call fixups")                                          \
                                                                             \
diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp
index d943248da..10f438bc5 100644
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp
@@ -136,6 +136,8 @@ address StubRoutines::_sha512_implCompressMB = NULL;
 address StubRoutines::_updateBytesCRC32 = NULL;
 address StubRoutines::_crc_table_adr = NULL;

+address StubRoutines::_ddotF2jBLAS = NULL;
+
 address StubRoutines::_multiplyToLen = NULL;
 address StubRoutines::_squareToLen = NULL;
 address StubRoutines::_mulAdd = NULL;
diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp
index e18b9127d..a4eeb910d 100644
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp
@@ -214,6 +214,8 @@ class StubRoutines: AllStatic {
   static address _updateBytesCRC32;
   static address _crc_table_adr;

+  static address _ddotF2jBLAS;
+
   static address _multiplyToLen;
   static address _squareToLen;
   static address _mulAdd;
@@ -377,6 +379,8 @@ class StubRoutines: AllStatic {
   static address updateBytesCRC32()    { return _updateBytesCRC32; }
   static address crc_table_addr()      { return _crc_table_adr; }

+  static address ddotF2jBLAS()         { return _ddotF2jBLAS; }
+
   static address multiplyToLen()       {return _multiplyToLen; }
   static address squareToLen()         {return _squareToLen; }
   static address mulAdd()              {return _mulAdd; }