From 04d4309b14515ee7f0bbd23360a61c1ee02a795a Mon Sep 17 00:00:00 2001
From: rdb <git@rdb.name>
Date: Sun, 1 Mar 2015 11:46:52 +0100
Subject: [PATCH] SSE2-enhanced sRGB encode (with runtime CPU detection)

---
 .../interfaceMakerPythonNative.cxx            |   9 +-
 makepanda/makepanda.py                        |  12 +-
 panda/src/gobj/texture.cxx                    | 141 +++++++-------
 panda/src/gobj/texture.h                      |   7 +
 panda/src/pnmimage/convert_srgb.I             | 175 ++++++++++++++++++
 panda/src/pnmimage/convert_srgb.cxx           | 165 +++++++++++++++++
 panda/src/pnmimage/convert_srgb.h             |  59 ++++++
 panda/src/pnmimage/convert_srgb_sse2.cxx      | 151 +++++++++++++++
 panda/src/pnmimage/p3pnmimage_composite1.cxx  |   1 +
 panda/src/tinydisplay/zbuffer.h               |   2 +-
 10 files changed, 645 insertions(+), 77 deletions(-)
 create mode 100644 panda/src/pnmimage/convert_srgb.I
 create mode 100644 panda/src/pnmimage/convert_srgb.cxx
 create mode 100644 panda/src/pnmimage/convert_srgb.h
 create mode 100644 panda/src/pnmimage/convert_srgb_sse2.cxx

diff --git a/dtool/src/interrogate/interfaceMakerPythonNative.cxx b/dtool/src/interrogate/interfaceMakerPythonNative.cxx
index 5bb52227ca..db63d2bc9d 100644
--- a/dtool/src/interrogate/interfaceMakerPythonNative.cxx
+++ b/dtool/src/interrogate/interfaceMakerPythonNative.cxx
@@ -3204,7 +3204,8 @@ write_function_instance(ostream &out, InterfaceMaker::Object *obj,
       }
       extra_convert += "PyObject *" + param_name + "_long = PyNumber_Long(" + param_name + ");";
       extra_param_check += " && " + param_name + "_long != NULL";
-      pexpr_string = "PyLong_AsUnsignedLongLong(" + param_name + "_long)";
+      pexpr_string = "(" + type->get_local_name(&parser) + ")" +
+                     "PyLong_AsUnsignedLongLong(" + param_name + "_long)";
       extra_cleanup += "Py_XDECREF(" + param_name + "_long);";
       expected_params += "unsigned long long";
       ++num_params;
@@ -3219,7 +3220,8 @@ write_function_instance(ostream &out, InterfaceMaker::Object *obj,
       }
       extra_convert += "PyObject *" + param_name + "_long = PyNumber_Long(" + param_name + ");";
       extra_param_check += " && " + param_name + "_long != NULL";
-      pexpr_string = "PyLong_AsLongLong(" + param_name + "_long)";
+      pexpr_string = "(" + type->get_local_name(&parser) + ")" +
+                     "PyLong_AsLongLong(" + param_name + "_long)";
       extra_cleanup += "Py_XDECREF(" + param_name + "_long);";
       expected_params += "long long";
       ++num_params;
@@ -3234,7 +3236,8 @@ write_function_instance(ostream &out, InterfaceMaker::Object *obj,
       }
       extra_convert += "PyObject *" + param_name + "_long = PyNumber_Long(" + param_name + ");";
       extra_param_check += " && " + param_name + "_long != NULL";
-      pexpr_string = "PyLong_AsUnsignedLong(" + param_name + "_long)";
+      pexpr_string = "(" + type->get_local_name(&parser) + ")" +
+                     "PyLong_AsUnsignedLong(" + param_name + "_long)";
       extra_cleanup += "Py_XDECREF(" + param_name + "_long);";
       expected_params += "unsigned int";
       ++num_params;
diff --git a/makepanda/makepanda.py b/makepanda/makepanda.py
index 72d30a5dab..0be433b6d7 100755
--- a/makepanda/makepanda.py
+++ b/makepanda/makepanda.py
@@ -949,7 +949,7 @@ def CompileCxx(obj,src,opts):
             if PkgSkip("TOUCHINPUT") == 0:
                 cmd += "/DWINVER=0x601 "
             cmd += "/Fo" + obj + " /nologo /c"
-            if (GetTargetArch() != 'x64' and PkgSkip("SSE2") == 0):
+            if GetTargetArch() != 'x64' and (not PkgSkip("SSE2") or 'SSE2' in opts):
                 cmd += " /arch:SSE2"
             for x in ipath: cmd += " /I" + x
             for (opt,dir) in INCDIRECTORIES:
@@ -1160,7 +1160,7 @@ def CompileCxx(obj,src,opts):
                 if optlevel >= 4 or GetTarget() == "android":
                     cmd += " -fno-rtti"
 
-        if PkgSkip("SSE2") == 0 and not arch.startswith("arm"):
+        if ('SSE2' in opts or not PkgSkip("SSE2")) and not arch.startswith("arm"):
             cmd += " -msse2"
 
         if optlevel >= 3:
@@ -1705,7 +1705,7 @@ def RunGenPyCode(target, inputs, opts):
     if (PkgSkip("PYTHON") != 0):
         return
 
-    cmdstr = sys.executable + " "
+    cmdstr = BracketNameWithQuotes(SDK["PYTHONEXEC"]) + " "
     if sys.version_info >= (2, 6):
         cmdstr += "-B "
 
@@ -1729,7 +1729,7 @@ def RunGenPyCode(target, inputs, opts):
 def FreezePy(target, inputs, opts):
     assert len(inputs) > 0
     # Make sure this function isn't called before genpycode is run.
-    cmdstr = sys.executable + " "
+    cmdstr = BracketNameWithQuotes(SDK["PYTHONEXEC"]) + " "
     if sys.version_info >= (2, 6):
         cmdstr += "-B "
 
@@ -1757,7 +1757,7 @@ def FreezePy(target, inputs, opts):
 def Package(target, inputs, opts):
     assert len(inputs) == 1
     # Invoke the ppackage script.
-    command = sys.executable + " "
+    command = BracketNameWithQuotes(SDK["PYTHONEXEC"]) + " "
     if GetOptimizeOption(opts) >= 4:
         command += "-OO "
 
@@ -3192,6 +3192,7 @@ if (not RUNTIME):
   OPTS=['DIR:panda/src/pnmimage', 'BUILDING:PANDA',  'ZLIB']
   TargetAdd('p3pnmimage_composite1.obj', opts=OPTS, input='p3pnmimage_composite1.cxx')
   TargetAdd('p3pnmimage_composite2.obj', opts=OPTS, input='p3pnmimage_composite2.cxx')
+  TargetAdd('p3pnmimage_convert_srgb_sse2.obj', opts=OPTS+['SSE2'], input='convert_srgb_sse2.cxx')
 
   OPTS=['DIR:panda/src/pnmimage', 'ZLIB']
   IGATEFILES=GetDirectoryContents('panda/src/pnmimage', ["*.h", "*_composite*.cxx"])
@@ -3621,6 +3622,7 @@ if (not RUNTIME):
   TargetAdd('libpanda.dll', input='p3pnmimagetypes_composite2.obj')
   TargetAdd('libpanda.dll', input='p3pnmimage_composite1.obj')
   TargetAdd('libpanda.dll', input='p3pnmimage_composite2.obj')
+  TargetAdd('libpanda.dll', input='p3pnmimage_convert_srgb_sse2.obj')
   TargetAdd('libpanda.dll', input='p3text_composite1.obj')
   TargetAdd('libpanda.dll', input='p3text_composite2.obj')
   TargetAdd('libpanda.dll', input='p3tform_composite1.obj')
diff --git a/panda/src/gobj/texture.cxx b/panda/src/gobj/texture.cxx
index d44903e921..08474cc445 100644
--- a/panda/src/gobj/texture.cxx
+++ b/panda/src/gobj/texture.cxx
@@ -41,6 +41,7 @@
 #include "pbitops.h"
 #include "streamReader.h"
 #include "texturePeeker.h"
+#include "convert_srgb.h"
 
 #ifdef HAVE_SQUISH
 #include <squish.h>
@@ -131,46 +132,6 @@ struct DDSHeader {
   DDSCaps2 caps;
 };
 
-// This table is used for converting unsigned char texture values in an sRGB
-// texture to linear RGB values, for use in mipmap generation.
-static float srgb_to_lrgbf[256] = {0.000000f, 0.000304f, 0.000607f, 0.000911f,
-  0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
-  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f,
-  0.006049f, 0.006512f, 0.006995f, 0.007499f, 0.008023f, 0.008568f, 0.009134f,
-  0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f,
-  0.014444f, 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f,
-  0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f, 0.025187f, 0.026241f,
-  0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f,
-  0.035601f, 0.036889f, 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f,
-  0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f, 0.054480f,
-  0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f,
-  0.068478f, 0.070360f, 0.072272f, 0.074214f, 0.076185f, 0.078187f, 0.080220f,
-  0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
-  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f,
-  0.114435f, 0.116971f, 0.119538f, 0.122139f, 0.124772f, 0.127438f, 0.130136f,
-  0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f,
-  0.152926f, 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f,
-  0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f, 0.191202f, 0.194618f,
-  0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f,
-  0.223228f, 0.226966f, 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f,
-  0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f, 0.274677f,
-  0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f,
-  0.309469f, 0.313989f, 0.318547f, 0.323143f, 0.327778f, 0.332452f, 0.337164f,
-  0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
-  0.376262f, 0.381326f, 0.386429f, 0.391572f, 0.396755f, 0.401978f, 0.407240f,
-  0.412543f, 0.417885f, 0.423268f, 0.428690f, 0.434154f, 0.439657f, 0.445201f,
-  0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473531f, 0.479320f, 0.485150f,
-  0.491021f, 0.496933f, 0.502886f, 0.508881f, 0.514918f, 0.520996f, 0.527115f,
-  0.533276f, 0.539479f, 0.545724f, 0.552011f, 0.558340f, 0.564712f, 0.571125f,
-  0.577580f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f,
-  0.623960f, 0.630757f, 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f,
-  0.672443f, 0.679542f, 0.686685f, 0.693872f, 0.701102f, 0.708376f, 0.715694f,
-  0.723055f, 0.730461f, 0.737910f, 0.745404f, 0.752942f, 0.760525f, 0.768151f,
-  0.775822f, 0.783538f, 0.791298f, 0.799103f, 0.806952f, 0.814847f, 0.822786f,
-  0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
-  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f,
-  0.947307f, 0.955973f, 0.964686f, 0.973445f, 0.982251f, 0.991102f, 1.000000f};
-
 ////////////////////////////////////////////////////////////////////
 //     Function: Texture::Constructor
 //       Access: Published
@@ -6990,7 +6951,13 @@ do_filter_2d_mipmap_pages(const CData *cdata,
     // We currently only support sRGB mipmap generation for
     // unsigned byte textures, due to our use of a lookup table.
     nassertv(cdata->_component_type == T_unsigned_byte);
-    filter_component = &filter_2d_unsigned_byte_srgb;
+
+    if (has_sse2_sRGB_encode()) {
+      filter_component = &filter_2d_unsigned_byte_srgb_sse2;
+    } else {
+      filter_component = &filter_2d_unsigned_byte_srgb;
+    }
+
     // Alpha is always linear.
     filter_alpha = &filter_2d_unsigned_byte;
 
@@ -7140,7 +7107,13 @@ do_filter_3d_mipmap_level(const CData *cdata,
     // We currently only support sRGB mipmap generation for
     // unsigned byte textures, due to our use of a lookup table.
     nassertv(cdata->_component_type == T_unsigned_byte);
-    filter_component = &filter_3d_unsigned_byte_srgb;
+
+    if (has_sse2_sRGB_encode()) {
+      filter_component = &filter_3d_unsigned_byte_srgb_sse2;
+    } else {
+      filter_component = &filter_3d_unsigned_byte_srgb;
+    }
+
     // Alpha is always linear.
     filter_alpha = &filter_3d_unsigned_byte;
 
@@ -7385,18 +7358,32 @@ filter_2d_unsigned_byte(unsigned char *&p, const unsigned char *&q,
 void Texture::
 filter_2d_unsigned_byte_srgb(unsigned char *&p, const unsigned char *&q,
                              size_t pixel_size, size_t row_size) {
-  float result = (srgb_to_lrgbf[q[0]] +
-                  srgb_to_lrgbf[q[pixel_size]] +
-                  srgb_to_lrgbf[q[row_size]] +
-                  srgb_to_lrgbf[q[pixel_size + row_size]]) / 4.0f;
+  float result = (decode_sRGB_float(q[0]) +
+                  decode_sRGB_float(q[pixel_size]) +
+                  decode_sRGB_float(q[row_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size]));
 
-  // This is based on the formula out of the EXT_texture_sRGB
-  // specification, except the factors are multiplied with 255.0f.
-  if (result < 0.0031308f) {
-    *p = (unsigned char)(result * 3294.6f);
-  } else {
-    *p = (unsigned char)(269.025f * powf(result, 0.41666f) - 14.025f);
-  }
+  *p = encode_sRGB_uchar(result * 0.25f);
+  ++p;
+  ++q;
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: Texture::filter_2d_unsigned_byte_srgb_sse2
+//       Access: Public, Static
+//  Description: Same as filter_2d_unsigned_byte_srgb (averages a 2x2
+//               block, then advances p and q), but encodes the result
+//               with encode_sRGB_uchar_sse2; requires SSE2 at runtime.
+////////////////////////////////////////////////////////////////////
+void Texture::
+filter_2d_unsigned_byte_srgb_sse2(unsigned char *&p, const unsigned char *&q,
+                                  size_t pixel_size, size_t row_size) {
+  float result = (decode_sRGB_float(q[0]) +
+                  decode_sRGB_float(q[pixel_size]) +
+                  decode_sRGB_float(q[row_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size]));
+
+  *p = encode_sRGB_uchar_sse2(result * 0.25f);
   ++p;
   ++q;
 }
@@ -7470,22 +7457,40 @@ filter_3d_unsigned_byte(unsigned char *&p, const unsigned char *&q,
 void Texture::
 filter_3d_unsigned_byte_srgb(unsigned char *&p, const unsigned char *&q,
                              size_t pixel_size, size_t row_size, size_t page_size) {
-  float result = (srgb_to_lrgbf[q[0]] +
-                  srgb_to_lrgbf[q[pixel_size]] +
-                  srgb_to_lrgbf[q[row_size]] +
-                  srgb_to_lrgbf[q[pixel_size + row_size]] +
-                  srgb_to_lrgbf[q[page_size]] +
-                  srgb_to_lrgbf[q[pixel_size + page_size]] +
-                  srgb_to_lrgbf[q[row_size + page_size]] +
-                  srgb_to_lrgbf[q[pixel_size + row_size + page_size]]) / 8.0f;
+  float result = (decode_sRGB_float(q[0]) +
+                  decode_sRGB_float(q[pixel_size]) +
+                  decode_sRGB_float(q[row_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size]) +
+                  decode_sRGB_float(q[page_size]) +
+                  decode_sRGB_float(q[pixel_size + page_size]) +
+                  decode_sRGB_float(q[row_size + page_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size + page_size]));
 
-  // This is based on the formula out of the EXT_texture_sRGB
-  // specification, except the factors are multiplied with 255.0f.
-  if (result < 0.0031308f) {
-    *p = (unsigned char)(result * 3294.6f);
-  } else {
-    *p = (unsigned char)(269.025f * powf(result, 0.41666f) - 14.025f);
-  }
+  *p = encode_sRGB_uchar(result * 0.125f);
+  ++p;
+  ++q;
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: Texture::filter_3d_unsigned_byte_srgb_sse2
+//       Access: Public, Static
+//  Description: Same as filter_3d_unsigned_byte_srgb (averages a 2x2x2
+//               block, then advances p and q), but encodes the result
+//               with encode_sRGB_uchar_sse2; requires SSE2 at runtime.
+////////////////////////////////////////////////////////////////////
+void Texture::
+filter_3d_unsigned_byte_srgb_sse2(unsigned char *&p, const unsigned char *&q,
+                                  size_t pixel_size, size_t row_size, size_t page_size) {
+  float result = (decode_sRGB_float(q[0]) +
+                  decode_sRGB_float(q[pixel_size]) +
+                  decode_sRGB_float(q[row_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size]) +
+                  decode_sRGB_float(q[page_size]) +
+                  decode_sRGB_float(q[pixel_size + page_size]) +
+                  decode_sRGB_float(q[row_size + page_size]) +
+                  decode_sRGB_float(q[pixel_size + row_size + page_size]));
+
+  *p = encode_sRGB_uchar_sse2(result * 0.125f);
   ++p;
   ++q;
 }
diff --git a/panda/src/gobj/texture.h b/panda/src/gobj/texture.h
index c8e0571aea..66e52ed7da 100644
--- a/panda/src/gobj/texture.h
+++ b/panda/src/gobj/texture.h
@@ -749,6 +749,9 @@ private:
   static void filter_2d_unsigned_byte_srgb(unsigned char *&p,
                                            const unsigned char *&q,
                                            size_t pixel_size, size_t row_size);
+  static void filter_2d_unsigned_byte_srgb_sse2(unsigned char *&p,
+                                                const unsigned char *&q,
+                                                size_t pixel_size, size_t row_size);
   static void filter_2d_unsigned_short(unsigned char *&p,
                                        const unsigned char *&q,
                                        size_t pixel_size, size_t row_size);
@@ -763,6 +766,10 @@ private:
                                            const unsigned char *&q,
                                            size_t pixel_size, size_t row_size,
                                            size_t page_size);
+  static void filter_3d_unsigned_byte_srgb_sse2(unsigned char *&p,
+                                                const unsigned char *&q,
+                                                size_t pixel_size, size_t row_size,
+                                                size_t page_size);
   static void filter_3d_unsigned_short(unsigned char *&p,
                                        const unsigned char *&q,
                                        size_t pixel_size, size_t row_size,
diff --git a/panda/src/pnmimage/convert_srgb.I b/panda/src/pnmimage/convert_srgb.I
new file mode 100644
index 0000000000..e841748fb4
--- /dev/null
+++ b/panda/src/pnmimage/convert_srgb.I
@@ -0,0 +1,175 @@
+// Filename: convert_srgb.I
+// Created by:  rdb (29Oct14)
+//
+////////////////////////////////////////////////////////////////////
+//
+// PANDA 3D SOFTWARE
+// Copyright (c) Carnegie Mellon University.  All rights reserved.
+//
+// All use of this software is subject to the terms of the revised BSD
+// license.  You should have received a copy of this license along
+// with this source code in a file named "LICENSE."
+//
+////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////
+//     Function: decode_sRGB_float
+//  Description: Decodes the sRGB-encoded unsigned char value to a
+//               linearized float in 0-1 (a to_linear_float_table lookup).
+////////////////////////////////////////////////////////////////////
+CONSTEXPR float decode_sRGB_float(unsigned char val) {
+  return to_linear_float_table[val];
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: decode_sRGB_float
+//  Description: Decodes the sRGB-encoded floating-point value in
+//               the range 0-1 to a linearized float in the range
+//               0-1.  Inputs outside this range produce invalid
+//               results.
+////////////////////////////////////////////////////////////////////
+INLINE float decode_sRGB_float(float val) {
+  return (val <= 0.04045f)
+    ? (val * (1.f / 12.92f))  // Linear segment near black.
+    : cpow((val + 0.055f) * (1.f / 1.055f), 2.4f);  // Power-law segment.
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: decode_sRGB_uchar
+//  Description: Decodes the sRGB-encoded unsigned char value to a
+//               linearized unsigned char (to_linear_uchar_table lookup).
+////////////////////////////////////////////////////////////////////
+CONSTEXPR unsigned char decode_sRGB_uchar(unsigned char val) {
+  return to_linear_uchar_table[val];
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: decode_sRGB_uchar
+//  Description: Decodes the sRGB-encoded floating-point value in
+//               the range 0-1 to a linearized unsigned char value,
+//               rounded to nearest.  Out-of-range inputs are clamped.
+////////////////////////////////////////////////////////////////////
+INLINE unsigned char decode_sRGB_uchar(float val) {
+  return (val <= 0.04045f)
+    ? (unsigned char)(max(0.f, val) * (255.f / 12.92f) + 0.5f)
+    : (unsigned char)(cpow((min(val, 1.f) + 0.055f) * (1.f / 1.055f), 2.4f) * 255.f + 0.5f);
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_float
+//  Description: Encodes the linearized unsigned char value to an
+//               sRGB-encoded floating-point value in the range 0-1.
+////////////////////////////////////////////////////////////////////
+INLINE float
+encode_sRGB_float(unsigned char val) {
+  // Unlikely use case, so there is no look-up table for this.  Every
+  // non-zero input (>= 1/255) lies above the linear segment's threshold.
+  return (val == 0) ? 0.f
+    : (1.055f * cpow((float)val * (1.f / 255.f), 0.41666f) - 0.055f);
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_float
+//  Description: Encodes the linearized floating-point value in the
+//               range 0-1 to an sRGB-encoded float in the range
+//               0-1.  Inputs outside this range produce invalid
+//               results.
+////////////////////////////////////////////////////////////////////
+INLINE float
+encode_sRGB_float(float val) {
+  return (val < 0.0031308f)
+    ? (val * 12.92f)  // Linear segment near black.
+    : (1.055f * cpow(val, 0.41666f) - 0.055f);  // Power-law segment.
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_uchar
+//  Description: Encodes the linearized unsigned char value to an
+//               sRGB-encoded unsigned char (to_srgb8_table lookup).
+////////////////////////////////////////////////////////////////////
+CONSTEXPR unsigned char
+encode_sRGB_uchar(unsigned char val) {
+  return to_srgb8_table[val];
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_uchar
+//  Description: Encodes the linearized floating-point value in the
+//               range 0-1 to an sRGB-encoded unsigned char value.
+//               Inputs outside this range are clamped.
+//
+//               When SSE2 support is known at compile time, this
+//               automatically uses an optimized version.  Otherwise,
+//               it does not attempt runtime CPU detection.  If you
+//               know that SSE2 is supported (ie. if the function
+//               has_sse2_sRGB_encode() returns true) you should
+//               call encode_sRGB_uchar_sse2 instead.
+////////////////////////////////////////////////////////////////////
+INLINE unsigned char
+encode_sRGB_uchar(float val) {
+#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
+  // Optimized approximation; more than accurate enough for a uchar.
+  return encode_sRGB_uchar_sse2(val);
+#else
+  // Formula factors are pre-scaled by 255; 13.525 = 0.055 * 255 - 0.5 (rounding).
+  return (val < 0.0031308f)
+    ? (unsigned char) (max(0.f, val) * 3294.6f + 0.5f)
+    : (unsigned char) (269.025f * cpow(min(val, 1.f), 0.41666f) - 13.525f);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_uchar
+//  Description: Encodes the linearized floating-point color value to
+//               an sRGB-encoded xel in the range 0-255.
+//
+//               When SSE2 support is known at compile time, this
+//               automatically uses an optimized version.  Otherwise,
+//               it does not attempt runtime CPU detection.  If you
+//               know that SSE2 is supported (ie. if the function
+//               has_sse2_sRGB_encode() returns true) you should
+//               call encode_sRGB_uchar_sse2 instead.
+////////////////////////////////////////////////////////////////////
+INLINE void
+encode_sRGB_uchar(const LColorf &color, xel &into) {
+#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
+  // SSE2 support compiled-in; we're guaranteed to have it.
+  encode_sRGB_uchar_sse2(color, into);
+#else
+  // Scalar fallback: encode the first three components one at a time.
+  PPM_ASSIGN(into,
+    encode_sRGB_uchar(color[0]),
+    encode_sRGB_uchar(color[1]),
+    encode_sRGB_uchar(color[2]));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////
+//     Function: encode_sRGB_uchar
+//  Description: Encodes the linearized floating-point color value to
+//               an sRGB-encoded xel and alpha in the range 0-255.
+//               Alpha is clamped and scaled, but not sRGB-encoded.
+//
+//               When SSE2 support is known at compile time, this
+//               automatically uses an optimized version.  Otherwise,
+//               it does not attempt runtime CPU detection.  If you
+//               know that SSE2 is supported (ie. if the function
+//               has_sse2_sRGB_encode() returns true) you should
+//               call encode_sRGB_uchar_sse2 instead.
+////////////////////////////////////////////////////////////////////
+INLINE void
+encode_sRGB_uchar(const LColorf &color, xel &into, xelval &into_alpha) {
+#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
+  // SSE2 support compiled-in; we're guaranteed to have it.
+  encode_sRGB_uchar_sse2(color, into, into_alpha);
+#else
+  // Scalar fallback: encode the first three components one at a time.
+  PPM_ASSIGN(into,
+    encode_sRGB_uchar(color[0]),
+    encode_sRGB_uchar(color[1]),
+    encode_sRGB_uchar(color[2]));
+  // Alpha stays linear; clamp to 0-1 like the color channels before scaling.
+  into_alpha = (xelval) (min(max(0.f, color[3]), 1.f) * 255.f + 0.5f);
+#endif
+}
diff --git a/panda/src/pnmimage/convert_srgb.cxx b/panda/src/pnmimage/convert_srgb.cxx
new file mode 100644
index 0000000000..ef27735d06
--- /dev/null
+++ b/panda/src/pnmimage/convert_srgb.cxx
@@ -0,0 +1,165 @@
+// Filename: convert_srgb.cxx
+// Created by:  rdb (13Nov14)
+//
+////////////////////////////////////////////////////////////////////
+//
+// PANDA 3D SOFTWARE
+// Copyright (c) Carnegie Mellon University.  All rights reserved.
+//
+// All use of this software is subject to the terms of the revised BSD
+// license.  You should have received a copy of this license along
+// with this source code in a file named "LICENSE."
+//
+////////////////////////////////////////////////////////////////////
+
+#include "convert_srgb.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#endif
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+#endif
+
+// Lookup tables for converting from unsigned char formats.
+ALIGN_64BYTE const
+unsigned char to_srgb8_table[256] = { 0x00, 0x0d, 0x16, 0x1c, 0x22, 0x26, 0x2a,
+  0x2e, 0x32, 0x35, 0x38, 0x3b, 0x3d, 0x40, 0x42, 0x45, 0x47, 0x49, 0x4b, 0x4d,
+  0x4f, 0x51, 0x53, 0x55, 0x56, 0x58, 0x5a, 0x5c, 0x5d, 0x5f, 0x60, 0x62, 0x63,
+  0x65, 0x66, 0x68, 0x69, 0x6a, 0x6c, 0x6d, 0x6e, 0x70, 0x71, 0x72, 0x73, 0x75,
+  0x76, 0x77, 0x78, 0x79, 0x7a, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
+  0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90,
+  0x91, 0x92, 0x93, 0x94, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b,
+  0x9c, 0x9d, 0x9e, 0x9f, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa3, 0xa4, 0xa5, 0xa6,
+  0xa7, 0xa7, 0xa8, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xad, 0xad, 0xae, 0xaf, 0xaf,
+  0xb0, 0xb1, 0xb2, 0xb2, 0xb3, 0xb4, 0xb4, 0xb5, 0xb6, 0xb6, 0xb7, 0xb8, 0xb9,
+  0xb9, 0xba, 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, 0xc0, 0xc0, 0xc1,
+  0xc2, 0xc2, 0xc3, 0xc4, 0xc4, 0xc5, 0xc5, 0xc6, 0xc7, 0xc7, 0xc8, 0xc8, 0xc9,
+  0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xcd, 0xce, 0xce, 0xcf, 0xd0, 0xd0, 0xd1,
+  0xd1, 0xd2, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5, 0xd6, 0xd6, 0xd7, 0xd7, 0xd8,
+  0xd8, 0xd9, 0xda, 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde, 0xde, 0xdf,
+  0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe2, 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6,
+  0xe6, 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xe9, 0xea, 0xea, 0xeb, 0xeb, 0xec, 0xec,
+  0xed, 0xed, 0xee, 0xee, 0xee, 0xef, 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2,
+  0xf3, 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf6, 0xf7, 0xf7, 0xf8, 0xf8,
+  0xf9, 0xf9, 0xfa, 0xfa, 0xfb, 0xfb, 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe,
+  0xff, 0xff};
+
+ALIGN_64BYTE const
+unsigned char to_linear_uchar_table[256] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02,
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04,
+  0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07,
+  0x07, 0x07, 0x08, 0x08, 0x08, 0x08, 0x09, 0x09, 0x09, 0x0a, 0x0a, 0x0a, 0x0b,
+  0x0b, 0x0c, 0x0c, 0x0c, 0x0d, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, 0x10,
+  0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17,
+  0x17, 0x18, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1e,
+  0x1f, 0x20, 0x20, 0x21, 0x22, 0x23, 0x23, 0x24, 0x25, 0x25, 0x26, 0x27, 0x28,
+  0x29, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33,
+  0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4c, 0x4d,
+  0x4e, 0x4f, 0x50, 0x51, 0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x5a, 0x5b, 0x5c,
+  0x5d, 0x5f, 0x60, 0x61, 0x63, 0x64, 0x65, 0x67, 0x68, 0x69, 0x6b, 0x6c, 0x6d,
+  0x6f, 0x70, 0x72, 0x73, 0x74, 0x76, 0x77, 0x79, 0x7a, 0x7c, 0x7d, 0x7f, 0x80,
+  0x82, 0x83, 0x85, 0x86, 0x88, 0x8a, 0x8b, 0x8d, 0x8e, 0x90, 0x92, 0x93, 0x95,
+  0x97, 0x98, 0x9a, 0x9c, 0x9d, 0x9f, 0xa1, 0xa3, 0xa4, 0xa6, 0xa8, 0xaa, 0xab,
+  0xad, 0xaf, 0xb1, 0xb3, 0xb5, 0xb7, 0xb8, 0xba, 0xbc, 0xbe, 0xc0, 0xc2, 0xc4,
+  0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,
+  0xe0, 0xe2, 0xe5, 0xe7, 0xe9, 0xeb, 0xed, 0xef, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa,
+  0xfd, 0xff};
+
+ALIGN_64BYTE
+const float to_linear_float_table[256] = { 0, 0.000304f, 0.000607f, 0.000911f,
+  0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f,
+  0.006049f, 0.006512f, 0.006995f, 0.007499f, 0.008023f, 0.008568f, 0.009134f,
+  0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f,
+  0.014444f, 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f,
+  0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f, 0.025187f, 0.026241f,
+  0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f,
+  0.035601f, 0.036889f, 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f,
+  0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f, 0.054480f,
+  0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f,
+  0.068478f, 0.070360f, 0.072272f, 0.074214f, 0.076185f, 0.078187f, 0.080220f,
+  0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f,
+  0.114435f, 0.116971f, 0.119538f, 0.122139f, 0.124772f, 0.127438f, 0.130136f,
+  0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f,
+  0.152926f, 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f,
+  0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f, 0.191202f, 0.194618f,
+  0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f,
+  0.223228f, 0.226966f, 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f,
+  0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f, 0.274677f,
+  0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f,
+  0.309469f, 0.313989f, 0.318547f, 0.323143f, 0.327778f, 0.332452f, 0.337164f,
+  0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+  0.376262f, 0.381326f, 0.386429f, 0.391572f, 0.396755f, 0.401978f, 0.407240f,
+  0.412543f, 0.417885f, 0.423268f, 0.428690f, 0.434154f, 0.439657f, 0.445201f,
+  0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473531f, 0.479320f, 0.485150f,
+  0.491021f, 0.496933f, 0.502886f, 0.508881f, 0.514918f, 0.520996f, 0.527115f,
+  0.533276f, 0.539479f, 0.545724f, 0.552011f, 0.558340f, 0.564712f, 0.571125f,
+  0.577580f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f,
+  0.623960f, 0.630757f, 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f,
+  0.672443f, 0.679542f, 0.686685f, 0.693872f, 0.701102f, 0.708376f, 0.715694f,
+  0.723055f, 0.730461f, 0.737910f, 0.745404f, 0.752942f, 0.760525f, 0.768151f,
+  0.775822f, 0.783538f, 0.791298f, 0.799103f, 0.806952f, 0.814847f, 0.822786f,
+  0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f,
+  0.947307f, 0.955973f, 0.964686f, 0.973445f, 0.982251f, 0.991102f, 1.000000f};
+
+
+#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
+// SSE2 support enabled at compile time.  No runtime detection mechanism needed.
+bool
+has_sse2_sRGB_encode() {
+  return true;
+}
+
+#else
+// SSE2 support not guaranteed.  Use a runtime detection mechanism.  The
+// cpuid probe is x86-only, so other GCC targets (eg. ARM) report false.
+bool
+has_sse2_sRGB_encode() {
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+  // CPUID leaf 1 reports SSE2 support in bit 26 of EDX.
+  unsigned int a, b, c, d;
+  static const bool has_support =
+    (__get_cpuid(1, &a, &b, &c, &d) == 1 && (d & 0x04000000) != 0);
+#elif defined(_WIN32)
+  static const bool has_support =
+    (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE) != FALSE);
+#else
+  // No detection mechanism for this compiler/platform; assume no SSE2.
+  static const bool has_support = false;
+#endif
+
+  if (pnmimage_cat.is_debug()) {
+    static bool checked = false;  // Log the outcome only once.
+    if (!checked) {
+#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || defined(_WIN32)
+      if (has_support) {
+        pnmimage_cat.debug()
+          << "Runtime detection reports SSE2 instructions available: "
+          << "SSE2-optimized sRGB encoding routines enabled.\n";
+      } else {
+        pnmimage_cat.debug()
+          << "Runtime detection reports SSE2 instructions unavailable: "
+          << "SSE2-optimized sRGB encoding routines disabled.\n";
+      }
+#else
+      pnmimage_cat.debug()
+        << "No runtime detection mechanism for SSE2 instructions available: "
+        << "SSE2-optimized sRGB encoding routines disabled.\n";
+#endif
+      checked = true;
+    }
+  }
+
+  return has_support;
+}
+
+#endif  // __SSE2__
diff --git a/panda/src/pnmimage/convert_srgb.h b/panda/src/pnmimage/convert_srgb.h
new file mode 100644
index 0000000000..6d59019d84
--- /dev/null
+++ b/panda/src/pnmimage/convert_srgb.h
@@ -0,0 +1,59 @@
+// Filename: convert_srgb.h
+// Created by:  rdb (13Nov14)
+//
+////////////////////////////////////////////////////////////////////
+//
+// PANDA 3D SOFTWARE
+// Copyright (c) Carnegie Mellon University.  All rights reserved.
+//
+// All use of this software is subject to the terms of the revised BSD
+// license.  You should have received a copy of this license along
+// with this source code in a file named "LICENSE."
+//
+////////////////////////////////////////////////////////////////////
+
+#ifndef CONVERT_SRGB_H
+#define CONVERT_SRGB_H
+
+#include "pandabase.h"
+#include "luse.h"
+#include "pnmimage_base.h"
+
+// The functions below encode and decode sRGB colors in various
+// representations.  Some of them are implemented using look-up tables,
+// others using SSE2 intrinsics.
+extern EXPCL_PANDA_PNMIMAGE const unsigned char to_srgb8_table[256];
+extern EXPCL_PANDA_PNMIMAGE const unsigned char to_linear_uchar_table[256];
+extern EXPCL_PANDA_PNMIMAGE const float to_linear_float_table[256];
+
+EXPCL_PANDA_PNMIMAGE CONSTEXPR float decode_sRGB_float(unsigned char val);
+EXPCL_PANDA_PNMIMAGE INLINE float decode_sRGB_float(float val);
+EXPCL_PANDA_PNMIMAGE CONSTEXPR unsigned char decode_sRGB_uchar(unsigned char val);
+EXPCL_PANDA_PNMIMAGE INLINE unsigned char decode_sRGB_uchar(float val);
+
+EXPCL_PANDA_PNMIMAGE INLINE float encode_sRGB_float(unsigned char val);
+EXPCL_PANDA_PNMIMAGE INLINE float encode_sRGB_float(float val);
+EXPCL_PANDA_PNMIMAGE CONSTEXPR unsigned char encode_sRGB_uchar(unsigned char val);
+EXPCL_PANDA_PNMIMAGE INLINE unsigned char encode_sRGB_uchar(float val);
+
+// These overloads convert several components of a color at once,
+// which can be faster due to vectorization.
+EXPCL_PANDA_PNMIMAGE INLINE void encode_sRGB_uchar(const LColorf &from,
+                                                   xel &into);
+EXPCL_PANDA_PNMIMAGE INLINE void encode_sRGB_uchar(const LColorf &from,
+                                                   xel &into, xelval &into_alpha);
+
+// Only call these functions when has_sse2_sRGB_encode() returns true.
+// Without SSE2 support, they will crash!
+EXPCL_PANDA_PNMIMAGE unsigned char encode_sRGB_uchar_sse2(float val);
+EXPCL_PANDA_PNMIMAGE void encode_sRGB_uchar_sse2(const LColorf &from,
+                                                 xel &into);
+EXPCL_PANDA_PNMIMAGE void encode_sRGB_uchar_sse2(const LColorf &from,
+                                                 xel &into, xelval &into_alpha);
+
+// Use the following to find out if you can call any of the *_sse2 functions.
+EXPCL_PANDA_PNMIMAGE bool has_sse2_sRGB_encode();
+
+#include "convert_srgb.I"
+
+#endif
diff --git a/panda/src/pnmimage/convert_srgb_sse2.cxx b/panda/src/pnmimage/convert_srgb_sse2.cxx
new file mode 100644
index 0000000000..e83a00394d
--- /dev/null
+++ b/panda/src/pnmimage/convert_srgb_sse2.cxx
@@ -0,0 +1,151 @@
+// Filename: convert_srgb_sse2.cxx
+// Created by:  rdb (13Nov14)
+//
+////////////////////////////////////////////////////////////////////
+//
+// PANDA 3D SOFTWARE
+// Copyright (c) Carnegie Mellon University.  All rights reserved.
+//
+// All use of this software is subject to the terms of the revised BSD
+// license.  You should have received a copy of this license along
+// with this source code in a file named "LICENSE."
+//
+////////////////////////////////////////////////////////////////////
+
+// This file should always be compiled with SSE2 support.  These
+// functions will only be called when SSE2 support is detected at
+// run-time.
+
+#include "convert_srgb.h"
+#include "luse.h"
+
+#if defined(__SSE2__) || (_M_IX86_FP >= 2) || defined(_M_X64) || defined(_M_AMD64)
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+static INLINE __m128i _encode_sRGB_sse2_mul255(__m128 val) {
+  // This is an SSE2-based approximation of the sRGB encode function.
+  // It has a maximum error of around 0.001, which is by far small
+  // enough for a uchar.  It is also at least 10x as fast as the
+  // original; up to 40x when taking advantage of vectorization.
+  // Note that the fourth float (alpha) is only multiplied by 255.
+
+  // Part of the code in this function is derived from:
+  // http://stackoverflow.com/a/6486630/2135754
+
+  // Clamp to 0-1 range.
+  val = _mm_max_ps(val, _mm_set1_ps(0.0f));
+  val = _mm_min_ps(val, _mm_set1_ps(1.0f));
+
+  // Pre-multiply with constant factor to adjust for exp bias.
+  __m128 xf = _mm_mul_ps(val, _mm_set1_ps(6.3307e18f));
+
+  // Approximate logarithm by reinterpreting the float bits as an integer.
+  xf = _mm_cvtepi32_ps(_mm_castps_si128(xf));
+
+  // Multiply 'logarithm' by power.
+  xf = _mm_mul_ps(xf, _mm_set1_ps(2.0f / 3.0f));
+
+  // Reverse of the above: convert to integer, reinterpret bits as float.
+  xf = _mm_castsi128_ps(_mm_cvtps_epi32(xf));
+
+  // Make an overestimate and an underestimate.
+  __m128 xover = _mm_mul_ps(val, xf);
+  __m128 xunder = _mm_mul_ps(_mm_mul_ps(val, val),
+                             _mm_rsqrt_ps(xf));
+
+  // Average the two factors, with a slight bias.
+  __m128 xavg = _mm_mul_ps(_mm_add_ps(xover, xunder),
+                           _mm_set1_ps(0.5286098f));
+
+  // Take square root twice, computed as x * rsqrt(x), which is faster
+  // than using the more expensive _mm_sqrt_ps instruction.
+  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+
+  // Bring it into the correct range.  These factors are determined
+  // not on the basis of accuracy, but are chosen such that the
+  // decoder lookup table produces an equivalent result for any value.
+  xavg = _mm_mul_ps(xavg, _mm_set1_ps(269.122f));
+  xavg = _mm_sub_ps(xavg, _mm_set1_ps(13.55f));
+
+  // Compute the linear section (slope 3294.6 = 12.92 * 255).  This is
+  // also the path that the alpha channel takes, so the alpha multiplier
+  // is set to 255 (since alpha is not sRGB-converted).
+  __m128 lval = _mm_mul_ps(val,
+    _mm_set_ps(255.0f, 3294.6f, 3294.6f, 3294.6f));  // lane 3 listed first
+
+  lval = _mm_add_ps(lval, _mm_set1_ps(0.5f));  // rounds once truncated below
+
+  // Decide which version to return.  Rig the alpha
+  // comparator to always fail so that the linear path
+  // is always chosen for alpha.
+  __m128 mask = _mm_cmpge_ps(val,
+    _mm_set_ps(2.0f, 0.0031308f, 0.0031308f, 0.0031308f));  // lane 3 first
+
+  // This is a non-branching way to return one or the other value.
+  return _mm_cvttps_epi32(_mm_or_ps(
+    _mm_and_ps(mask, xavg),
+    _mm_andnot_ps(mask, lval)));
+}
+
+unsigned char
+encode_sRGB_uchar_sse2(float val) {
+  // Running only a single component through this function is still
+  // way faster than the equivalent non-SSE2 version.
+  return (unsigned char)  // 16-bit extract is SSE2; the 32-bit one is SSE4.1.
+    _mm_extract_epi16(_encode_sRGB_sse2_mul255(_mm_set1_ps(val)), 0);
+}
+
+void
+encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
+#ifdef LINMATH_ALIGN
+  __m128 vec = _mm_load_ps(color.get_data());
+#else
+  __m128 vec = _mm_loadu_ps(color.get_data());
+#endif
+  // Results are 0-255, so SSE2's 16-bit extract (not SSE4.1's 32-bit) works.
+  __m128i vals = _encode_sRGB_sse2_mul255(vec);
+  into.r = _mm_extract_epi16(vals, 0);  // Low word of 32-bit lane 0.
+  into.g = _mm_extract_epi16(vals, 2);  // Low word of 32-bit lane 1.
+  into.b = _mm_extract_epi16(vals, 4);  // Low word of 32-bit lane 2.
+}
+
+void
+encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
+#ifdef LINMATH_ALIGN
+  __m128 vec = _mm_load_ps(color.get_data());
+#else
+  __m128 vec = _mm_loadu_ps(color.get_data());
+#endif
+  // Results are 0-255, so SSE2's 16-bit extract (not SSE4.1's 32-bit) works.
+  __m128i vals = _encode_sRGB_sse2_mul255(vec);
+  into.r = _mm_extract_epi16(vals, 0);  // Low word of 32-bit lane 0.
+  into.g = _mm_extract_epi16(vals, 2);  // Low word of 32-bit lane 1.
+  into.b = _mm_extract_epi16(vals, 4);  // Low word of 32-bit lane 2.
+  into_alpha = _mm_extract_epi16(vals, 6);  // Lane 3: alpha, scaled by 255.
+}
+
+#else
+// Compiled without SSE2 support: still define these functions, forwarding
+// to encode_sRGB_uchar(), and warn that the build system is misconfigured.
+// NOTE(review): confirm encode_sRGB_uchar() cannot dispatch back to these.
+#warning convert_srgb_sse2.cxx is being compiled without SSE2 support!
+
+unsigned char
+encode_sRGB_uchar_sse2(float val) {
+  return encode_sRGB_uchar(val);
+}
+
+void
+encode_sRGB_uchar_sse2(const LColorf &color, xel &into) {
+  encode_sRGB_uchar(color, into);
+}
+
+void
+encode_sRGB_uchar_sse2(const LColorf &color, xel &into, xelval &into_alpha) {
+  encode_sRGB_uchar(color, into, into_alpha);
+}
+
+#endif
diff --git a/panda/src/pnmimage/p3pnmimage_composite1.cxx b/panda/src/pnmimage/p3pnmimage_composite1.cxx
index 0e290cbc95..85dfe7f0a1 100644
--- a/panda/src/pnmimage/p3pnmimage_composite1.cxx
+++ b/panda/src/pnmimage/p3pnmimage_composite1.cxx
@@ -1,4 +1,5 @@
 #include "config_pnmimage.cxx"
+#include "convert_srgb.cxx"
 #include "pfmFile.cxx"
 #include "pnm-image-filter.cxx"
 #include "pnmbitio.cxx"
diff --git a/panda/src/tinydisplay/zbuffer.h b/panda/src/tinydisplay/zbuffer.h
index a4c8a2015c..46190ef4e6 100644
--- a/panda/src/tinydisplay/zbuffer.h
+++ b/panda/src/tinydisplay/zbuffer.h
@@ -75,7 +75,7 @@ typedef unsigned int ZPOINT;
   ((((unsigned int)(a) << 24) & 0xff000000) | (((unsigned int)(r) << 16) & 0xff0000) | (((unsigned int)(g) << 8) & 0xff00) | (unsigned int)(b))
 
 #define SRGB_TO_PIXEL(r,g,b) \
-  ((encode_sRGB[(unsigned int)(r) >> 4] << 16) | (encode_sRGB10[(unsigned int)(g) >> 4] << 8) | (encode_sRGB[(unsigned int)(b) >> 4]))
+  ((encode_sRGB[(unsigned int)(r) >> 4] << 16) | (encode_sRGB[(unsigned int)(g) >> 4] << 8) | (encode_sRGB[(unsigned int)(b) >> 4]))
 #define SRGBA_TO_PIXEL(r,g,b,a) \
   ((((unsigned int)(a) << 16) & 0xff000000) | (encode_sRGB[(unsigned int)(r) >> 4] << 16) | (encode_sRGB[(unsigned int)(g) >> 4] << 8) | (encode_sRGB[(unsigned int)(b) >> 4]))