Alex Naidis | 7a055fd | 2016-10-01 12:23:07 +0200 | [diff] [blame] | 1 | Enabling SSE support |
Matt Sarett | 1146686 | 2016-02-19 13:41:30 -0500 | [diff] [blame] | 2 | |
| 3 | Copyright (c) 2016 Google, Inc. |
Alex Naidis | 7a055fd | 2016-10-01 12:23:07 +0200 | [diff] [blame] | 4 | Written by Mike Klein, Matt Sarett |
Matt Sarett | 1146686 | 2016-02-19 13:41:30 -0500 | [diff] [blame] | 5 | |
Alex Naidis | 7a055fd | 2016-10-01 12:23:07 +0200 | [diff] [blame] | 6 | This INSTALL file was written by Glenn Randers-Pehrson, 2016. |
| 7 | |
| 8 | If you have moved intel_init.c and filter_sse2_intrinsics.c to a different |
| 9 | directory, be sure to update the '#include "../../pngpriv.h"' line in both |
| 10 | files if necessary to point to the correct relative location of pngpriv.h |
| 11 | with respect to the new location of those files. |
| 12 | |
| 13 | To enable SSE support in libpng, follow the instructions in I, II, or III, |
| 14 | below: |
| 15 | |
| 16 | I. Using patched "configure" scripts: |
| 17 | |
| 18 | First, apply intel_sse.patch in your build directory. |
| 19 | |
| 20 | patch -i contrib/intel/intel_sse.patch -p1 |
| 21 | |
| 22 | Then, if you are not building in a new GIT clone, e.g., in a tar |
| 23 | distribution, remove any existing pre-built configure scripts: |
| 24 | |
| 25 | ./configure --enable-maintainer-mode |
| 26 | make maintainer-clean |
| 27 | ./autogen.sh --maintainer --clean |
| 28 | |
| 29 | Finally, configure libpng with -DPNG_INTEL_SSE in CPPFLAGS: |
| 30 | |
| 31 | ./autogen.sh --maintainer |
| 32 | CPPFLAGS="-DPNG_INTEL_SSE" ./configure [options] |
| 33 | make CPPFLAGS="-DPNG_INTEL_SSE" [options] |
| 34 | make |
| 35 | |
| 36 | II. Using a custom makefile: |
| 37 | |
| 38 | If you are using a custom makefile, you will have to update it |
| 39 | manually to include contrib/intel/*.o in the dependencies, and to define |
| 40 | PNG_INTEL_SSE. |
| 41 | |
| 42 | III. Using manually updated "configure" scripts: |
| 43 | |
| 44 | If you prefer, manually edit pngpriv.h, configure.ac, and Makefile.am, |
| 45 | following the instructions below, then follow the instructions in |
| 46 | section II of INSTALL in the main libpng directory, then configure libpng |
| 47 | with -DPNG_INTEL_SSE in CPPFLAGS. |
| 48 | |
| 49 | 1. Add the following code to configure.ac under HOST SPECIFIC OPTIONS |
| 50 | directly beneath the section for ARM: |
| 51 | |
| 52 | -----------------cut---------------- |
| 53 | # INTEL |
| 54 | # ===== |
| 55 | # |
| 56 | # INTEL SSE (SIMD) support. |
| 57 | |
| 58 | AC_ARG_ENABLE([intel-sse], |
| 59 | AS_HELP_STRING([[[--enable-intel-sse]]], |
| 60 | [Enable Intel SSE optimizations: =no/off, yes/on:] |
| 61 | [no/off: disable the optimizations;] |
| 62 | [yes/on: enable the optimizations.] |
| 63 | [If not specified: determined by the compiler.]), |
| 64 | [case "$enableval" in |
| 65 | no|off) |
| 66 | # disable the default enabling: |
| 67 | AC_DEFINE([PNG_INTEL_SSE_OPT], [0], |
| 68 | [Disable Intel SSE optimizations]) |
| 69 | # Prevent inclusion of the Intel-specific source files below: |
| 70 | enable_intel_sse=no;; |
| 71 | yes|on) |
| 72 | AC_DEFINE([PNG_INTEL_SSE_OPT], [1], |
| 73 | [Enable Intel SSE optimizations]);; |
| 74 | *) |
| 75 | AC_MSG_ERROR([--enable-intel-sse=${enable_intel_sse}: invalid value]) |
| 76 | esac]) |
| 77 | |
| 78 | # Add Intel-specific files to all builds where the host_cpu is Intel |
| 79 | # ('i?86' or 'x86_64'), or where Intel optimizations were explicitly requested |
| 80 | # (this allows a fallback if a future host CPU does not match those patterns) |
| 81 | AM_CONDITIONAL([PNG_INTEL_SSE], |
| 82 | [test "$enable_intel_sse" != 'no' && |
| 83 | case "$host_cpu" in |
| 84 | i?86|x86_64) :;; |
| 85 | *) test "$enable_intel_sse" != '';; |
| 86 | esac]) |
| 87 | -----------------cut---------------- |
| 88 | |
| 89 | 2. Add the following code to Makefile.am under HOST SPECIFIC OPTIONS |
| 90 | directly beneath the "if PNG_ARM_NEON ... endif" statement: |
| 91 | |
| 92 | -----------------cut---------------- |
| 93 | if PNG_INTEL_SSE |
| 94 | libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += contrib/intel/intel_init.c\ |
| 95 | contrib/intel/filter_sse2_intrinsics.c |
| 96 | endif |
| 97 | -----------------cut---------------- |
| 98 | |
| 99 | 3. Add the following lines to pngpriv.h, following the PNG_ARM_NEON_OPT |
| 100 | code: |
| 101 | |
| 102 | -----------------cut---------------- |
| 103 | #ifndef PNG_INTEL_SSE_OPT |
| 104 | # ifdef PNG_INTEL_SSE |
| 105 | /* Only check for SSE if the build configuration has been modified to |
| 106 | * enable SSE optimizations. This means that these optimizations will |
| 107 | * be off by default. See contrib/intel for more details. |
| 108 | */ |
| 109 | # if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \ |
| 110 | defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ |
| 111 | (defined(_M_IX86_FP) && _M_IX86_FP >= 2) |
| 112 | # define PNG_INTEL_SSE_OPT 1 |
| 113 | # endif |
| 114 | # endif |
| 115 | #endif |
| 116 | |
| 117 | #if PNG_INTEL_SSE_OPT > 0 |
| 118 | # ifndef PNG_INTEL_SSE_IMPLEMENTATION |
| 119 | # if defined(__SSE4_1__) || defined(__AVX__) |
| 120 | /* We are not actually using AVX, but checking for AVX is the best |
| 121 | way we can detect SSE4.1 and SSSE3 on MSVC. |
| 122 | */ |
| 123 | # define PNG_INTEL_SSE_IMPLEMENTATION 3 |
| 124 | # elif defined(__SSSE3__) |
| 125 | # define PNG_INTEL_SSE_IMPLEMENTATION 2 |
| 126 | # elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ |
| 127 | (defined(_M_IX86_FP) && _M_IX86_FP >= 2) |
| 128 | # define PNG_INTEL_SSE_IMPLEMENTATION 1 |
| 129 | # else |
| 130 | # define PNG_INTEL_SSE_IMPLEMENTATION 0 |
| 131 | # endif |
| 132 | # endif |
| 133 | |
| 134 | # if PNG_INTEL_SSE_IMPLEMENTATION > 0 |
| 135 | # define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_sse2 |
| 136 | # endif |
| 137 | #endif |
| 138 | |
| 139 | -----------------cut---------------- |
| 140 | |
| 141 | 4. Add the following lines to pngpriv.h, following the prototype for |
| 142 | png_read_filter_row_paeth4_neon: |
| 143 | |
| 144 | -----------------cut---------------- |
| 145 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop |
| 146 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 147 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_sse2,(png_row_infop |
| 148 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 149 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_sse2,(png_row_infop |
| 150 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 151 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_sse2,(png_row_infop |
| 152 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 153 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_sse2,(png_row_infop |
| 154 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 155 | PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop |
| 156 | row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); |
| 157 | |
| 158 | -----------------cut---------------- |