From 1c6f4c29c4e9b31798d63d48dcc162a550d38e51 Mon Sep 17 00:00:00 2001
From: rdb <git@rdb.name>
Date: Thu, 28 Apr 2016 11:58:46 +0200
Subject: [PATCH] Add support for native FLAC reading

---
 panda/src/movies/config_movies.cxx       |    5 +
 panda/src/movies/dr_flac.h               | 2976 ++++++++++++++++++++++
 panda/src/movies/flacAudio.I             |   12 +
 panda/src/movies/flacAudio.cxx           |   64 +
 panda/src/movies/flacAudio.h             |   54 +
 panda/src/movies/flacAudioCursor.I       |   12 +
 panda/src/movies/flacAudioCursor.cxx     |  120 +
 panda/src/movies/flacAudioCursor.h       |   65 +
 panda/src/movies/p3movies_composite1.cxx |    2 +
 9 files changed, 3310 insertions(+)
 create mode 100644 panda/src/movies/dr_flac.h
 create mode 100644 panda/src/movies/flacAudio.I
 create mode 100644 panda/src/movies/flacAudio.cxx
 create mode 100644 panda/src/movies/flacAudio.h
 create mode 100644 panda/src/movies/flacAudioCursor.I
 create mode 100644 panda/src/movies/flacAudioCursor.cxx
 create mode 100644 panda/src/movies/flacAudioCursor.h

diff --git a/panda/src/movies/config_movies.cxx b/panda/src/movies/config_movies.cxx
index db5d6be074..fbd80e275d 100644
--- a/panda/src/movies/config_movies.cxx
+++ b/panda/src/movies/config_movies.cxx
@@ -13,6 +13,8 @@
 
 #include "config_movies.h"
 #include "dconfig.h"
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
 #include "inkblotVideo.h"
 #include "inkblotVideoCursor.h"
 #include "microphoneAudio.h"
@@ -75,6 +77,8 @@ init_libmovies() {
   }
   initialized = true;
 
+  FlacAudio::init_type();
+  FlacAudioCursor::init_type();
   InkblotVideo::init_type();
   InkblotVideoCursor::init_type();
   MicrophoneAudio::init_type();
@@ -93,6 +97,7 @@ init_libmovies() {
 #endif
 
   MovieTypeRegistry *reg = MovieTypeRegistry::get_global_ptr();
+  reg->register_audio_type(&FlacAudio::make, "flac");
   reg->register_audio_type(&WavAudio::make, "wav wave");
 
 #ifdef HAVE_VORBIS
diff --git a/panda/src/movies/dr_flac.h b/panda/src/movies/dr_flac.h
new file mode 100644
index 0000000000..7043ec66f1
--- /dev/null
+++ b/panda/src/movies/dr_flac.h
@@ -0,0 +1,2976 @@
+// Public domain. See "unlicense" statement at the end of this file.
+//NB: modified by rdb to use 16-bit instead of 32-bit samples.
+
+// ABOUT
+//
+// This is a simple library for decoding FLAC files.
+//
+//
+//
+// USAGE
+//
+// This is a single-file library. To use it, do something like the following in one .c file.
+//   #define DR_FLAC_IMPLEMENTATION
+//   #include "dr_flac.h"
+//
+// You can then #include this file in other parts of the program as you would with any other header file. To decode audio data,
+// do something like the following:
+//
+//     drflac* pFlac = drflac_open_file("MySong.flac");
+//     if (pFlac == NULL) {
+//         ... Failed to open FLAC file ...
+//     }
+//
+//     int16_t* pSamples = malloc(pFlac->totalSampleCount * sizeof(int16_t));
+//     uint64_t numberOfSamplesActuallyRead = drflac_read_s16(pFlac, pFlac->totalSampleCount, pSamples);
+//
+//     ... pSamples now contains the decoded samples as interleaved signed 16-bit PCM ...
+//
+// The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
+// channels and the bits per sample, should be directly accessible - just make sure you don't change their values.
+//
+// You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and
+// the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
+// samples, just call it again. Example:
+//
+//     while (drflac_read_s16(pFlac, chunkSize, pChunkSamples) > 0) {
+//         do_something();
+//     }
+//
+// You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example,
+// if you were to seek to the sample at index 0 in a stereo stream, you'll be seeking to the first sample of the left channel.
+// The sample at index 1 will be the first sample of the right channel. The sample at index 2 will be the second sample of the
+// left channel, etc.
+//
+//
+//
+// OPTIONS
+// #define these options before including this file.
+//
+// #define DR_FLAC_NO_STDIO
+//   Disable drflac_open_file().
+//
+// #define DR_FLAC_NO_WIN32_IO
+//   Don't use the Win32 API internally for drflac_open_file(). Setting this will force stdio FILE APIs instead. This is
+//   mainly for testing, but it's left here in case somebody might find use for it. dr_flac will use the Win32 API by
+//   default. Ignored when DR_FLAC_NO_STDIO is #defined.
+//
+// #define DR_FLAC_BUFFER_SIZE <number>
+//   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls
+//   back to the client for more data. Larger values means more memory, but better performance. My tests show diminishing
+//   returns after about 4KB (which is the default). Consider reducing this if you have a very efficient implementation of
+//   onRead(), or increase it if it's very inefficient.
+//
+//
+//
+// QUICK NOTES
+//
+// - Based on my own tests, the 32-bit build is about 1.1x-1.25x slower than the reference implementation. The 64-bit
+//   build is at about parity.
+// - This should work fine with valid native FLAC files, but it won't work very well when the STREAMINFO block is unavailable
+//   and when a stream starts in the middle of a frame. This is something I plan on addressing.
+// - Audio data is retrieved as signed 16-bit PCM, regardless of the bits per sample the FLAC stream is encoded as.
+// - This has not been tested on big-endian architectures.
+// - Rice codes in unencoded binary form (see https://xiph.org/flac/format.html#rice_partition) have not been tested. If anybody
+//   knows where I can find some test files for this, let me know.
+// - Perverse and erroneous files have not been tested. Again, if you know where I can get some test files let me know.
+// - dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
+// - dr_flac does not currently do any CRC checks.
+// - Ogg encapsulation is not supported, but I want to add it at some point.
+//
+//
+//
+// TODO
+// - Implement a proper test suite.
+// - Add support for initializing the decoder without a STREAMINFO block. Build a synthetic test to get support working at at least
+//   a basic level.
+// - Add support for retrieving metadata blocks so applications can retrieve the album art or whatnot.
+// - Add support for Ogg encapsulation.
+
+#ifndef dr_flac_h
+#define dr_flac_h
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+// As data is read from the client it is placed into an internal buffer for fast access. This controls the
+// size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
+// returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
+#ifndef DR_FLAC_BUFFER_SIZE
+#define DR_FLAC_BUFFER_SIZE   4096
+#endif
+
+// Check if we can enable 64-bit optimizations.
+#if defined(_WIN64)
+#define DRFLAC_64BIT
+#endif
+
+#if defined(__GNUC__)
+#if defined(__x86_64__) || defined(__ppc64__)
+#define DRFLAC_64BIT
+#endif
+#endif
+
+#ifdef DRFLAC_64BIT
+typedef uint64_t drflac_cache_t;
+#else
+typedef uint32_t drflac_cache_t;
+#endif
+
+
+
+// Callback for when data is read. Return value is the number of bytes actually read.
+typedef size_t (* drflac_read_proc)(void* userData, void* bufferOut, size_t bytesToRead);
+
+// Callback for when data needs to be seeked. Offset is always relative to the current position. Return value is false on failure, true on success.
+typedef bool (* drflac_seek_proc)(void* userData, int offset);
+
+// Records where the payload of one metadata block lives in the stream. drflac keeps one of these per block type it tracks.
+typedef struct
+{
+    // The absolute position of the first byte of the data of the block. This is just past the block's header.
+    long long pos;
+
+    // The size in bytes of the block's data.
+    unsigned int sizeInBytes;
+
+} drflac_block;
+// Decoding state for one sub-frame, i.e. a single channel's share of a frame (drflac_frame::subframes holds one per channel).
+typedef struct
+{
+    // The type of the subframe: DRFLAC_SUBFRAME_CONSTANT, DRFLAC_SUBFRAME_VERBATIM, DRFLAC_SUBFRAME_FIXED or DRFLAC_SUBFRAME_LPC.
+    unsigned char subframeType;
+
+    // The number of wasted bits per sample as specified by the sub-frame header.
+    unsigned char wastedBitsPerSample;
+
+    // The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC.
+    unsigned char lpcOrder;
+
+    // The number of bits per sample for this subframe. This is not always equal to the current frame's bit per sample because
+    // an extra bit is required for side channels when interchannel decorrelation is being used.
+    int bitsPerSample;
+
+    // A pointer to the buffer containing the decoded samples in the subframe, one signed 32-bit integer per sample.
+    // NOTE(review): this comment used to refer to drflac::pHeap, which drflac does not declare; presumably the buffer lives in drflac::pExtraData.
+    int32_t* pDecodedSamples;
+
+} drflac_subframe;
+// Header fields and read progress of the frame the decoder is currently positioned on (stored in drflac::currentFrame).
+typedef struct
+{
+    // If the stream uses variable block sizes, this will be set to the index of the first sample. If fixed block sizes are used, this will
+    // always be set to 0.
+    unsigned long long sampleNumber;
+
+    // If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0.
+    unsigned int frameNumber;
+
+    // The sample rate of this frame.
+    unsigned int sampleRate;
+
+    // The number of samples in each sub-frame within this frame.
+    unsigned short blockSize;
+
+    // The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
+    // will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
+    unsigned char channelAssignment;
+
+    // The number of bits per sample within this frame.
+    unsigned char bitsPerSample;
+
+    // The frame's CRC. This is set, but unused at the moment (dr_flac does not currently do any CRC checks).
+    unsigned char crc8;
+
+    // The number of samples left to be read in this frame. This is initially set to the block size multiplied by the channel count. As samples
+    // are read, this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
+    unsigned int samplesRemaining;
+
+    // The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels.
+    drflac_subframe subframes[8];
+
+} drflac_frame;
+// The decoder object. Client callbacks and stream properties come first, followed by the internal bit-reader and sample-buffer state.
+typedef struct
+{
+    // The function to call when more data needs to be read. This is set by drflac_open().
+    drflac_read_proc onRead;
+
+    // The function to call when the current read position needs to be moved.
+    drflac_seek_proc onSeek;
+
+    // The user data to pass around to onRead and onSeek.
+    void* pUserData;
+
+
+    // The sample rate. Will be set to something like 44100.
+    unsigned int sampleRate;
+
+    // The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
+    // value specified in the STREAMINFO block.
+    unsigned char channels;
+
+    // The bits per sample. Will be set to something like 16, 24, etc.
+    unsigned char bitsPerSample;
+
+    // The maximum block size, in samples. This number represents the number of samples in each channel (not combined).
+    unsigned short maxBlockSize;
+
+    // The total number of samples making up the stream. This includes every channel. For example, if the stream has 2 channels,
+    // with each channel having a total of 4096, this value will be set to 2*4096 = 8192.
+    uint64_t totalSampleCount;
+
+
+    // The location and size of the APPLICATION block.
+    drflac_block applicationBlock;
+
+    // The location and size of the SEEKTABLE block.
+    drflac_block seektableBlock;
+
+    // The location and size of the VORBIS_COMMENT block.
+    drflac_block vorbisCommentBlock;
+
+    // The location and size of the CUESHEET block.
+    drflac_block cuesheetBlock;
+
+    // The location and size of the PICTURE block.
+    drflac_block pictureBlock;
+
+
+    // Information about the frame the decoder is currently sitting on.
+    drflac_frame currentFrame;
+
+    // The position of the first frame in the stream. This is only ever used for seeking.
+    unsigned long long firstFramePos;
+
+
+    // Bit reader state follows: `cache` is the "L1" cache that bits are taken from and `cacheL2` is the buffer it is refilled from.
+    // The current byte position in the client's data stream.
+    uint64_t currentBytePos;
+
+    // The index of the next valid cache line in the "L2" cache.
+    size_t nextL2Line;
+
+    // The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining.
+    size_t consumedBits;
+
+    // Unused L2 lines. This will always be 0 until the end of the stream is hit. Used for correctly calculating the current byte
+    // position of the read pointer in the stream.
+    size_t unusedL2Lines;
+
+    // The cached data which was most recently read from the client. When data is read from the client, it is placed within this
+    // variable. As data is read, it's bit-shifted such that the next valid bit is sitting on the most significant bit.
+    drflac_cache_t cache;
+    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];
+
+
+    // A pointer to the decoded sample data. This is an offset of pExtraData.
+    int32_t* pDecodedSamples;
+
+    // Variable length extra data. We attach this to the end of the object so we avoid unnecessary mallocs.
+    char pExtraData[1];
+
+} drflac;
+
+
+
+
+// Opens a FLAC decoder.
+//
+// This is the lowest level function for opening a FLAC stream. You can also use drflac_open_file() and drflac_open_memory()
+// to open the stream from a file or from a block of memory respectively.
+//
+// At the moment the STREAMINFO block must be present for this to succeed.
+// The usage example at the top of this file treats a NULL return value as failure.
+// The onRead and onSeek callbacks are used to read and seek data provided by the client; pUserData is handed to both of them.
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData);
+
+// Closes the given FLAC decoder.
+static void drflac_close(drflac* pFlac);
+
+// Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
+//
+// Returns the number of samples actually read, which can be fewer than samplesToRead when the decoder cannot supply that many.
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* pBufferOut);
+
+// Seeks to the sample at the given index. The index counts interleaved samples, so in a stereo stream index 1 is the first right-channel sample.
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex);
+
+
+
+#ifndef DR_FLAC_NO_STDIO
+// Opens a FLAC decoder from the file at the given path. Not available when DR_FLAC_NO_STDIO is defined.
+static drflac* drflac_open_file(const char* pFile);
+#endif
+
+// Helper for opening a file from a pre-allocated memory buffer.
+//
+// This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
+// the lifetime of the decoder.
+static drflac* drflac_open_memory(const void* data, size_t dataSize);
+
+#endif  //dr_flac_h
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// IMPLEMENTATION
+//
+///////////////////////////////////////////////////////////////////////////////
+#ifdef DR_FLAC_IMPLEMENTATION
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>     // For _byteswap_ulong and _byteswap_uint64
+#endif
+
+#ifdef __linux__
+#define _BSD_SOURCE
+#include <endian.h>
+#endif
+
+#ifdef _MSC_VER
+#define DRFLAC_INLINE __forceinline
+#else
+#define DRFLAC_INLINE inline
+#endif
+// Metadata block type codes. NOTE(review): these look like the BLOCK_TYPE values of the FLAC format - confirm against the spec.
+#define DRFLAC_BLOCK_TYPE_STREAMINFO                    0
+#define DRFLAC_BLOCK_TYPE_PADDING                       1
+#define DRFLAC_BLOCK_TYPE_APPLICATION                   2
+#define DRFLAC_BLOCK_TYPE_SEEKTABLE                     3
+#define DRFLAC_BLOCK_TYPE_VORBIS_COMMENT                4
+#define DRFLAC_BLOCK_TYPE_CUESHEET                      5
+#define DRFLAC_BLOCK_TYPE_PICTURE                       6
+#define DRFLAC_BLOCK_TYPE_INVALID                       127
+// Sub-frame types, as stored in drflac_subframe::subframeType.
+#define DRFLAC_SUBFRAME_CONSTANT                        0
+#define DRFLAC_SUBFRAME_VERBATIM                        1
+#define DRFLAC_SUBFRAME_FIXED                           8
+#define DRFLAC_SUBFRAME_LPC                             32
+#define DRFLAC_SUBFRAME_RESERVED                        255
+// Residual coding methods. NOTE(review): presumably the coding-method field of the FLAC residual header - confirm against the spec.
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
+#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
+// Channel assignments, as stored in drflac_frame::channelAssignment.
+#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
+#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
+#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
+#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
+// One seek point. NOTE(review): presumably one entry of the SEEKTABLE block; the code that fills this in is not visible from here.
+typedef struct
+{
+    uint64_t firstSample;
+    uint64_t frameOffset;   // The offset from the first byte of the header of the first frame.
+    uint16_t sampleCount;
+} drflac_seekpoint;
+
+#ifndef DR_FLAC_NO_STDIO
+#if defined(DR_FLAC_NO_WIN32_IO) || !defined(_WIN32)
+#include <stdio.h>
+// Read callback for the stdio backend: pUserData is the FILE* that drflac_open_file() passed to drflac_open().
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    return fread(bufferOut, 1, bytesToRead, (FILE*)pUserData);  // Element size is 1, so fread()'s result is a byte count.
+}
+// Seek callback for the stdio backend: moves the FILE* in pUserData by `offset` bytes relative to its current position.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    return fseek((FILE*)pUserData, offset, SEEK_CUR) == 0;  // fseek() returns 0 on success.
+}
+// Opens a FLAC decoder that reads the file at `filename` through stdio. Returns NULL if the file or the decoder cannot be opened.
+static drflac* drflac_open_file(const char* filename)
+{
+    FILE* pFile;
+#ifdef _MSC_VER
+    if (fopen_s(&pFile, filename, "rb") != 0) pFile = NULL;
+#else
+    pFile = fopen(filename, "rb");
+#endif
+    if (pFile == NULL) {
+        return NULL;    // Not `false`: a bool is not a valid null pointer constant when this header is compiled as C++11 or later.
+    }
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, pFile);
+    if (pFlac == NULL) fclose(pFile);   // No decoder was created to carry the FILE*, so it has to be released here.
+    return pFlac;
+}
+#else
+#include <windows.h>
+// Read callback for the Win32 backend: pUserData is the file HANDLE that drflac_open_file() passed to drflac_open().
+static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    assert(bytesToRead < 0xFFFFFFFF);   // dr_flac will never request huge amounts of data at a time. This is a safe assertion.
+
+    DWORD bytesRead;
+    ReadFile((HANDLE)pUserData, bufferOut, (DWORD)bytesToRead, &bytesRead, NULL);
+    // NOTE(review): ReadFile()'s BOOL result is ignored; whatever count it stored in bytesRead is returned even on failure.
+    return (size_t)bytesRead;
+}
+// Seek callback for the Win32 backend: moves the file pointer of the HANDLE in pUserData by `offset` bytes from its current position.
+static bool drflac__on_seek_stdio(void* pUserData, int offset)
+{
+    return SetFilePointer((HANDLE)pUserData, offset, NULL, FILE_CURRENT) != INVALID_SET_FILE_POINTER;
+}
+// Opens a FLAC decoder that reads the file at `filename` through the Win32 file API. Returns NULL if the file or the decoder cannot be opened.
+static drflac* drflac_open_file(const char* filename)
+{
+    HANDLE hFile = CreateFileA(filename, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return NULL;     // Not `false`: a bool is not a valid null pointer constant in C++11 or later.
+
+    drflac* pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)hFile);
+    if (pFlac == NULL) CloseHandle(hFile);              // No decoder was created to carry the handle, so it has to be released here.
+    return pFlac;
+}
+#endif
+#endif  //DR_FLAC_NO_STDIO
+
+// Read state for the in-memory backend: the caller's buffer (not copied) plus a read cursor. Allocated by drflac_open_memory().
+typedef struct
+{
+    /// A pointer to the beginning of the data. We use a char as the type here for easy offsetting.
+    const unsigned char* data;
+
+    /// The size of the data, in bytes.
+    size_t dataSize;
+
+    /// The position we're currently sitting at, as a byte offset from `data`.
+    size_t currentReadPos;
+
+} drflac_memory;
+// Read callback for the in-memory backend: copies up to bytesToRead bytes from the cursor into bufferOut and returns the number copied.
+static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
+{
+    drflac_memory* memory = (drflac_memory*)pUserData;
+    assert(memory != NULL);
+    assert(memory->dataSize >= memory->currentReadPos);
+
+    // Never hand out more than what is left between the cursor and the end of the buffer.
+    const size_t available = memory->dataSize - memory->currentReadPos;
+    const size_t count = (bytesToRead > available) ? available : bytesToRead;
+    if (count == 0) {
+        return 0;
+    }
+
+    memcpy(bufferOut, memory->data + memory->currentReadPos, count);
+    memory->currentReadPos += count;
+
+    return count;
+}
+// Seek callback for the in-memory backend: moves the cursor by `offset` bytes, clamped to the buffer. Always reports success.
+static bool drflac__on_seek_memory(void* pUserData, int offset)
+{
+    drflac_memory* memory = (drflac_memory*)pUserData;
+    assert(memory != NULL);
+
+    if (offset <= 0) {
+        // Moving backwards (or not at all): never step in front of the first byte.
+        if (memory->currentReadPos < (size_t)-offset) {
+            offset = -(int)memory->currentReadPos;
+        }
+    } else if (memory->currentReadPos + offset > memory->dataSize) {
+        // Moving forwards: never step past the end of the buffer.
+        offset = (int)(memory->dataSize - memory->currentReadPos);
+    }
+
+    // The clamping above guarantees that the cursor stays within [0, dataSize].
+    memory->currentReadPos += offset;
+
+    return true;
+}
+// Opens a FLAC decoder that reads from `data` (dataSize bytes, not copied). Returns NULL on allocation failure or if the decoder cannot be opened.
+static drflac* drflac_open_memory(const void* data, size_t dataSize)
+{
+    drflac_memory* pUserData = (drflac_memory*)malloc(sizeof(*pUserData));
+    if (pUserData == NULL) return NULL;     // Not `false`: a bool is not a valid null pointer constant in C++11 or later.
+
+    pUserData->data = (const unsigned char*)data;
+    pUserData->dataSize = dataSize;
+    pUserData->currentReadPos = 0;
+    drflac* pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, pUserData);
+    if (pFlac == NULL) free(pUserData);     // No decoder was created to carry the read state, so it has to be released here.
+    return pFlac;
+}
+
+
+//// Endian Management ////
+static DRFLAC_INLINE bool drflac__is_little_endian()
+{   // Run-time byte order probe: returns true when the host stores the least significant byte of an int first.
+    int n = 1;
+    return (*(char*)&n) == 1;   // The lowest-addressed byte of n holds the 1 only on a little-endian host.
+}
+// Reverses the byte order of a 32-bit value. The GCC test checks __GNUC_MINOR__ because __builtin_bswap32 first appeared in GCC 4.3.
+static DRFLAC_INLINE uint32_t drflac__swap_endian_uint32(uint32_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_ulong(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    return __builtin_bswap32(n);
+#else
+    return ((n & 0xFF000000) >> 24) |
+           ((n & 0x00FF0000) >>  8) |
+           ((n & 0x0000FF00) <<  8) |
+           ((n & 0x000000FF) << 24);
+#endif
+}
+// Reverses the byte order of a 64-bit value. The GCC test checks __GNUC_MINOR__ because __builtin_bswap64 first appeared in GCC 4.3.
+static DRFLAC_INLINE uint64_t drflac__swap_endian_uint64(uint64_t n)
+{
+#ifdef _MSC_VER
+    return _byteswap_uint64(n);
+#elif defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
+    return __builtin_bswap64(n);
+#else
+    return ((n & 0xFF00000000000000ULL) >> 56) |
+           ((n & 0x00FF000000000000ULL) >> 40) |
+           ((n & 0x0000FF0000000000ULL) >> 24) |
+           ((n & 0x000000FF00000000ULL) >>  8) |
+           ((n & 0x00000000FF000000ULL) <<  8) |
+           ((n & 0x0000000000FF0000ULL) << 24) |
+           ((n & 0x000000000000FF00ULL) << 40) |
+           ((n & 0x00000000000000FFULL) << 56);
+#endif
+}
+
+// Converts a 32-bit value from big-endian (the byte order the cache lines are read in) to the host's byte order.
+static DRFLAC_INLINE uint32_t drflac__be2host_32(uint32_t n)
+{
+#ifdef __linux__
+    return be32toh(n);      // From <endian.h>, which is included for Linux builds.
+#else
+    // No be32toh() on this platform: probe the host byte order and swap only when it differs from big-endian.
+    if (!drflac__is_little_endian()) {
+        return n;
+    }
+    return drflac__swap_endian_uint32(n);
+#endif
+}
+// Converts a 64-bit value from big-endian (the byte order the cache lines are read in) to the host's byte order.
+static DRFLAC_INLINE uint64_t drflac__be2host_64(uint64_t n)
+{
+#ifdef __linux__
+    return be64toh(n);      // From <endian.h>, which is included for Linux builds.
+#else
+    // No be64toh() on this platform: probe the host byte order and swap only when it differs from big-endian.
+    if (!drflac__is_little_endian()) {
+        return n;
+    }
+    return drflac__swap_endian_uint64(n);
+#endif
+}
+
+#ifdef DRFLAC_64BIT
+#define drflac__be2host__cache_line drflac__be2host_64
+#else
+#define drflac__be2host__cache_line drflac__be2host_32
+#endif
+
+
+// BIT READING ATTEMPT #2
+//
+// This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
+// on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
+// is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
+// array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
+// from onRead() is read into. Each macro below expands to an expression that uses a variable named pFlac from the calling scope.
+#define DRFLAC_CACHE_L1_SIZE_BYTES                  (sizeof(pFlac->cache))
+#define DRFLAC_CACHE_L1_SIZE_BITS                   (sizeof(pFlac->cache)*8)
+#define DRFLAC_CACHE_L1_BITS_REMAINING              (DRFLAC_CACHE_L1_SIZE_BITS - (pFlac->consumedBits))
+#ifdef DRFLAC_64BIT
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint64_t)-1LL) >> (_bitCount)))    // Top _bitCount bits set; _bitCount must be below 64.
+#else
+#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint32_t)-1) >> (_bitCount)))      // Top _bitCount bits set; _bitCount must be below 32.
+#endif
+#define DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount)  (DRFLAC_CACHE_L1_SIZE_BITS - (_bitCount))
+#define DRFLAC_CACHE_L1_SELECT(_bitCount)           ((pFlac->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
+#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(_bitCount) (DRFLAC_CACHE_L1_SELECT(_bitCount) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount))   // The top _bitCount bits of the cache, moved to the low end.
+#define DRFLAC_CACHE_L2_SIZE_BYTES                  (sizeof(pFlac->cacheL2))
+#define DRFLAC_CACHE_L2_LINE_COUNT                  (DRFLAC_CACHE_L2_SIZE_BYTES / sizeof(pFlac->cacheL2[0]))
+#define DRFLAC_CACHE_L2_LINES_REMAINING             (DRFLAC_CACHE_L2_LINE_COUNT - pFlac->nextL2Line)
+// Loads the next L2 line into pFlac->cache, refilling the L2 through onRead() when it is used up. The line is not byte-swapped and consumedBits is not touched here. Returns false when no whole line could be loaded.
+static DRFLAC_INLINE bool drflac__reload_l1_cache_from_l2(drflac* pFlac)
+{
+    // Fast path. Try loading straight from L2.
+    if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+    // If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client.
+    size_t bytesRead = pFlac->onRead(pFlac->pUserData, pFlac->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES);
+    pFlac->currentBytePos += bytesRead;
+
+    pFlac->nextL2Line = 0;
+    if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES) {      // The L2 was filled completely: serve its first line.
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+
+
+    // If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
+    // means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
+    // and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
+    // the size of the L1 so we'll need to seek backwards by any misaligned bytes.
+    size_t alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES;
+    if (alignedL1LineCount > 0)
+    {
+        size_t offset = DRFLAC_CACHE_L2_LINE_COUNT - alignedL1LineCount;
+        for (size_t i = alignedL1LineCount; i > 0; --i) {   // Highest index first, so a line is never overwritten before it has been moved.
+            pFlac->cacheL2[i-1 + offset] = pFlac->cacheL2[i-1];
+        }
+
+        pFlac->nextL2Line = offset;
+        pFlac->unusedL2Lines = offset;
+
+        // At this point there may be some leftover unaligned bytes. We need to seek backwards so we don't lose
+        // those bytes.
+        size_t unalignedBytes = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES);
+        if (unalignedBytes > 0) {
+            pFlac->onSeek(pFlac->pUserData, -(int)unalignedBytes);     // NOTE(review): the result of onSeek() is not checked.
+            pFlac->currentBytePos -= unalignedBytes;
+        }
+
+        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
+        return true;
+    }
+    else
+    {
+        // If we get into this branch it means we weren't able to load any L1-aligned data. We just need to seek
+        // backwards by the leftover bytes and return false.
+        if (bytesRead > 0) {
+            pFlac->onSeek(pFlac->pUserData, -(int)bytesRead);          // NOTE(review): the result of onSeek() is not checked.
+            pFlac->currentBytePos -= bytesRead;
+        }
+
+        pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;                // Mark the L2 as empty so that the next call reads from the client again.
+        return false;
+    }
+}
+// Refills pFlac->cache with the next bits of the stream in host byte order and sets consumedBits to match. Returns false when the client has no more data.
+static bool drflac__reload_cache(drflac* pFlac)
+{
+    // Fast path. Try just moving the next value in the L2 cache to the L1 cache.
+    if (drflac__reload_l1_cache_from_l2(pFlac)) {
+        pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+        pFlac->consumedBits = 0;
+        return true;
+    }
+
+    // Slow path.
+
+    // If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
+    // few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
+    // data straight from the client into the L1 cache. This should only really happen once per stream so efficiency is not important.
+    size_t bytesRead = pFlac->onRead(pFlac->pUserData, &pFlac->cache, DRFLAC_CACHE_L1_SIZE_BYTES);
+    if (bytesRead == 0) {
+        return false;
+    }
+
+    pFlac->currentBytePos += bytesRead;
+    // A full line would have been served by the fast path, so only a partial line is expected here.
+    assert(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES);
+    pFlac->consumedBits = (DRFLAC_CACHE_L1_SIZE_BYTES - bytesRead) * 8;    // The bytes that could not be read count as already consumed.
+
+    pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
+    pFlac->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_SIZE_BITS - pFlac->consumedBits);    // <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property.
+    return true;
+}
+// Skips bitsToSeek bits of the stream without decoding them. Returns false if the cache cannot be reloaded before the target position is reached.
+static bool drflac__seek_bits(drflac* pFlac, size_t bitsToSeek)
+{
+    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING) {
+        pFlac->consumedBits += bitsToSeek;
+        pFlac->cache = (bitsToSeek < DRFLAC_CACHE_L1_SIZE_BITS) ? (pFlac->cache << bitsToSeek) : 0;    // Shifting by the full cache width is undefined, so clear the cache instead.
+        return true;
+    } else {
+        // It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here.
+        bitsToSeek -= DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING;
+        pFlac->cache = 0;
+
+        size_t wholeBytesRemaining = bitsToSeek/8;
+        if (wholeBytesRemaining > 0)
+        {
+            // The next bytes to seek will be located in the L2 cache. The problem is that the L2 cache is not byte aligned,
+            // but rather DRFLAC_CACHE_L1_SIZE_BYTES aligned (usually 4 or 8). If, for example, the number of bytes to seek is
+            // 3, we'll need to handle it in a special way.
+            size_t wholeCacheLinesRemaining = wholeBytesRemaining / DRFLAC_CACHE_L1_SIZE_BYTES;
+            if (wholeCacheLinesRemaining < DRFLAC_CACHE_L2_LINES_REMAINING)
+            {
+                wholeBytesRemaining -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += wholeCacheLinesRemaining;
+            }
+            else
+            {
+                wholeBytesRemaining -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BYTES;
+                bitsToSeek -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BITS;
+                pFlac->nextL2Line += DRFLAC_CACHE_L2_LINES_REMAINING;
+
+                pFlac->onSeek(pFlac->pUserData, (int)wholeBytesRemaining);
+                pFlac->currentBytePos += wholeBytesRemaining;
+                bitsToSeek -= wholeBytesRemaining*8;
+            }
+        }
+
+        // Whatever is left is smaller than one cache line: reload the cache and skip the remainder inside it.
+        if (bitsToSeek > 0) {
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+
+            return drflac__seek_bits(pFlac, bitsToSeek);
+        }
+
+        return true;
+    }
+}
+// Reads the next bitCount (1..32) bits of the stream into *pResultOut as an unsigned integer. Returns false, without writing the result, when the cache cannot be reloaded.
+static bool drflac__read_uint32(drflac* pFlac, unsigned int bitCount, uint32_t* pResultOut)
+{
+    assert(pFlac != NULL);
+    assert(pResultOut != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    if (pFlac->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS) {     // The L1 cache is used up: refill it before reading.
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+    }
+
+    if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING) {
+        if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS) {
+            *pResultOut = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCount);
+            pFlac->consumedBits += bitCount;
+            pFlac->cache <<= bitCount;
+        } else {    // The request covers the entire cache, so take it as it is rather than shifting by the full width.
+            *pResultOut = (uint32_t)pFlac->cache;
+            pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+            pFlac->cache = 0;
+        }
+        return true;
+    } else {
+        // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
+        size_t bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING;
+        size_t bitCountLo = bitCount - bitCountHi;
+        uint32_t resultHi = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountHi);   // Taken before the reload below replaces the cache.
+
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+
+        *pResultOut = (resultHi << bitCountLo) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo);
+        pFlac->consumedBits += bitCountLo;
+        pFlac->cache <<= bitCountLo;
+        return true;
+    }
+}
+// Reads the next bitCount (1..32) bits of the stream as a two's complement signed integer into *pResult. Returns false if the bits cannot be read.
+static bool drflac__read_int32(drflac* pFlac, unsigned int bitCount, int32_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 32);
+
+    uint32_t result;
+    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
+        return false;
+    }
+    // Sign-extend in unsigned arithmetic: shifting a negative value, or shifting by 32, is undefined. A 32-bit value is already complete.
+    if (bitCount < 32 && (result & ((uint32_t)1 << (bitCount - 1)))) {
+        result |= (~(uint32_t)0) << bitCount;
+    }
+
+    *pResult = (int32_t)result;
+    return true;
+}
+
+// Reads an unsigned value of 33..64 bits as two smaller reads: the upper (bitCount - 32) bits first, then the lower
+// 32 bits. *pResultOut is written only when both reads succeed; otherwise false is returned.
+static bool drflac__read_uint64(drflac* pFlac, unsigned int bitCount, uint64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+    assert(bitCount >  32);
+
+    uint32_t upperBits;
+    uint32_t lowerBits;
+    if (!drflac__read_uint32(pFlac, bitCount - 32, &upperBits) ||
+        !drflac__read_uint32(pFlac, 32, &lowerBits)) {
+        return false;
+    }
+
+    const uint64_t combined = (((uint64_t)upperBits) << 32) | ((uint64_t)lowerBits);
+    *pResultOut = combined;
+    return true;
+}
+
+// Reads a bitCount-bit two's complement value (33..64 bits, the range drflac__read_uint64() accepts), sign-extended into *pResultOut.
+static bool drflac__read_int64(drflac* pFlac, unsigned int bitCount, int64_t* pResultOut)
+{
+    assert(bitCount <= 64);
+    assert(bitCount >  32);
+    uint64_t result;
+    if (!drflac__read_uint64(pFlac, bitCount, &result)) {
+        return false;
+    }
+    // Sign-extend using unsigned arithmetic only: shifting a negative value, or shifting by the full 64-bit width, is undefined.
+    if (bitCount < 64 && ((result >> (bitCount - 1)) & 1u) != 0) {
+        result |= (~(uint64_t)0) << bitCount;
+    }
+    *pResultOut = (int64_t)result;
+    return true;
+}
+
+// Reads an unsigned value of 1..16 bits. *pResult is written only when the underlying 32-bit read succeeds.
+static bool drflac__read_uint16(drflac* pFlac, unsigned int bitCount, uint16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    uint32_t wideValue;
+    const bool readOk = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (readOk) {
+        *pResult = (uint16_t)wideValue;    // Narrow to the caller's type.
+    }
+    return readOk;
+}
+
+// Reads a signed value of 1..16 bits via drflac__read_int32(). *pResult is written only when that read succeeds.
+static bool drflac__read_int16(drflac* pFlac, unsigned int bitCount, int16_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 16);
+
+    int32_t wideValue;
+    const bool readOk = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (readOk) {
+        *pResult = (int16_t)wideValue;    // Narrow to the caller's type.
+    }
+    return readOk;
+}
+
+// Reads an unsigned value of 1..8 bits. *pResult is written only when the underlying 32-bit read succeeds.
+static bool drflac__read_uint8(drflac* pFlac, unsigned int bitCount, uint8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    uint32_t wideValue;
+    const bool readOk = drflac__read_uint32(pFlac, bitCount, &wideValue);
+    if (readOk) {
+        *pResult = (uint8_t)wideValue;    // Narrow to the caller's type.
+    }
+    return readOk;
+}
+
+// Reads a signed value of 1..8 bits via drflac__read_int32(). *pResult is written only when that read succeeds.
+static bool drflac__read_int8(drflac* pFlac, unsigned int bitCount, int8_t* pResult)
+{
+    assert(pFlac != NULL);
+    assert(pResult != NULL);
+    assert(bitCount > 0);
+    assert(bitCount <= 8);
+
+    int32_t wideValue;
+    const bool readOk = drflac__read_int32(pFlac, bitCount, &wideValue);
+    if (readOk) {
+        *pResult = (int8_t)wideValue;    // Narrow to the caller's type.
+    }
+    return readOk;
+}
+
+
+// Consumes bits up to and including the next 1 bit. On success *pOffsetOut receives the number of 0 bits that came before
+// that 1 bit. Returns false (leaving *pOffsetOut untouched) if the cache runs dry and cannot be reloaded.
+static inline bool drflac__seek_past_next_set_bit(drflac* pFlac, unsigned int* pOffsetOut)
+{
+    // Indexed by a 4-bit value (presumably the top 4 cache bits): 1-based position of its first set bit, or 0 if it has none.
+    static const unsigned int bitOffsetTable[] = {
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    // While the cache is all zeros, every bit still remaining in it belongs to the run of zeros: count them and load more data.
+    unsigned int zeroCounter = 0;
+    while (pFlac->cache == 0) {
+        zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+        if (!drflac__reload_cache(pFlac)) {
+            return false;
+        }
+    }
+
+    // The cache is now non-zero, so the set bit is somewhere inside it and no further reloading is needed.
+    assert(pFlac->cache != 0);
+    unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+    if (setBitOffsetPlus1 == 0) {
+        if (pFlac->cache == 1) {
+            setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
+        } else {
+            setBitOffsetPlus1 = 5;
+            while ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1)) == 0) {
+                setBitOffsetPlus1 += 1;
+            }
+        }
+    }
+
+    pFlac->consumedBits += setBitOffsetPlus1;
+    if (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS) {
+        pFlac->cache <<= setBitOffsetPlus1;
+    } else {
+        pFlac->cache = 0;    // Shifting by the full cache width is undefined; the intended result is an empty cache.
+    }
+    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
+    return true;
+}
+
+
+
+// Repositions the underlying stream at the absolute byte offset offsetFromStart by issuing relative onSeek() calls, each
+// limited to the range of an int. currentBytePos is advanced only by steps that onSeek() reports as successful, and the
+// bit caches are invalidated once any seek has been attempted. Returns false as soon as a step fails.
+// NOTE(review): a zero-distance request returns early without touching the caches, exactly as before - confirm with callers.
+static bool drflac__seek_to_byte(drflac* pFlac, long long offsetFromStart)
+{
+    assert(pFlac != NULL);
+
+    long long bytesToMove = offsetFromStart - pFlac->currentBytePos;
+    if (bytesToMove == 0) {
+        return true;
+    }
+
+    const int maxStep = 0x7FFFFFFF;
+    const int minStep = -maxStep - 1;
+
+    bool result = true;
+    while (result && bytesToMove != 0) {
+        int step;
+        if (bytesToMove > maxStep) {
+            step = maxStep;
+        } else if (bytesToMove < minStep) {
+            step = minStep;
+        } else {
+            step = (int)bytesToMove;
+        }
+
+        result = pFlac->onSeek(pFlac->pUserData, step);
+        if (result) {    // Only account for movement that actually happened.
+            pFlac->currentBytePos += step;
+            bytesToMove -= step;
+        }
+    }
+
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+    pFlac->cache = 0;
+    pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT; // <-- This clears the L2 cache.
+
+    return result;
+}
+
+// Reports the stream position as seen by the decoder: currentBytePos minus the bytes counted as still unread in the L1
+// cache word and in the L2 cache lines.
+static long long drflac__tell(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+    const size_t pendingInL1 = DRFLAC_CACHE_L1_SIZE_BYTES - (pFlac->consumedBits/8);
+    const size_t pendingInL2 = DRFLAC_CACHE_L2_SIZE_BYTES - ((pFlac->nextL2Line - pFlac->unusedL2Lines)*DRFLAC_CACHE_L1_SIZE_BYTES);
+    return pFlac->currentBytePos - pendingInL1 - pendingInL2;
+}
+
+
+
+// Decodes one number stored in a UTF-8 style variable-length encoding (1 to 7 bytes) into *pNumberOut.
+//
+// The number of leading 1 bits in the first byte gives the total length: none means a single 7-bit byte, two to seven
+// mean that many bytes in total. Exactly one leading 1 bit, or eight (0xFF), is rejected as a bad encoding. Each
+// following byte contributes its low 6 bits; the top two bits of those bytes are not checked.
+//
+// On any failure (a byte could not be read, or the first byte is invalid) *pNumberOut is set to 0 and false is returned.
+static bool drflac__read_utf8_coded_number(drflac* pFlac, unsigned long long* pNumberOut)
+{
+    assert(pFlac != NULL);
+    assert(pNumberOut != NULL);
+
+    // The number is expected to start on a byte boundary of the bit stream.
+    assert((pFlac->consumedBits & 7) == 0);
+
+    unsigned char leadByte = 0;
+    if (!drflac__read_uint8(pFlac, 8, &leadByte)) {
+        *pNumberOut = 0;
+        return false;
+    }
+
+    // Single-byte form: the value is the byte itself.
+    if ((leadByte & 0x80) == 0) {
+        *pNumberOut = leadByte;
+        return true;
+    }
+
+    // Count the leading 1 bits of the first byte; for a valid multi-byte sequence this is the total byte count.
+    int byteCount = 0;
+    for (unsigned int probe = 0x80; (leadByte & probe) != 0; probe >>= 1) {
+        byteCount += 1;
+    }
+
+    if (byteCount < 2 || byteCount > 7) {
+        *pNumberOut = 0;
+        return false;     // Bad UTF-8 encoding.
+    }
+
+    // The payload of the first byte is whatever follows the length prefix and the 0 bit that terminates it.
+    unsigned long long value = (unsigned long long)(leadByte & (0xFF >> (byteCount + 1)));
+
+    for (int remaining = byteCount - 1; remaining > 0; --remaining) {
+        unsigned char nextByte = 0;
+        if (!drflac__read_uint8(pFlac, 8, &nextByte)) {
+            *pNumberOut = 0;
+            return false;
+        }
+
+        value = (value << 6) | (nextByte & 0x3F);
+    }
+
+    *pNumberOut = value;
+    return true;
+}
+
+
+
+// Skips over one Rice-coded value without decoding it: first the bits up to and including the next set bit, then the m
+// bits that follow it. Returns false if either step fails.
+static DRFLAC_INLINE bool drflac__read_and_seek_rice(drflac* pFlac, unsigned char m)
+{
+    unsigned int zeroRunLength;    // Required by the helper, but not needed here.
+    if (!drflac__seek_past_next_set_bit(pFlac, &zeroRunLength)) {
+        return false;
+    }
+
+    // Nothing more to skip when there are no trailing bits.
+    if (m == 0) {
+        return true;
+    }
+    return drflac__seek_bits(pFlac, m);
+}
+
+
+// The next two functions are responsible for calculating the prediction.
+//
+// When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
+// safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
+//
+//
+// Optimization Experiment #1
+//
+// The first optimization experiment I'm trying here is a loop unroll for the most common LPC orders. I've done a little test
+// and the results are as follows, in order of most common:
+// 1)  order = 8  : 93.1M
+// 2)  order = 7  : 36.6M
+// 3)  order = 3  : 33.2M
+// 4)  order = 6  : 20.9M
+// 5)  order = 5  : 18.1M
+// 6)  order = 4  : 15.8M
+// 7)  order = 12 : 10.8M
+// 8)  order = 2  :  9.8M
+// 9)  order = 1  :  1.6M
+// 10) order = 10 :  1.0M
+// 11) order = 9  :  0.8M
+// 12) order = 11 :  0.8M
+//
+// We'll experiment with unrolling the top 8 most common ones. We'll ignore the least common ones since there seems to be a
+// large drop off there.
+//
+// Result: There's a tiny improvement in some cases, but it could just be within margin of error so unsure if it's worthwhile
+// just yet.
+static DRFLAC_INLINE int32_t drflac__calculate_prediction_32(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+    // Returns (sum for j in [0, order) of coefficients[j] * pDecodedSamples[-1-j]) >> shift, accumulated in a plain int.
+    // 32-bit version.
+
+    // This method is slower on both 32- and 64-bit builds with VC++. Leaving this here for now just in case we need it later
+    // for whatever reason.
+#if 0
+    int prediction;
+    if (order == 8)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+        prediction += coefficients[7] * pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+        prediction += coefficients[6] * pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+        prediction += coefficients[5] * pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+        prediction += coefficients[4] * pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+        prediction += coefficients[2] * pDecodedSamples[-3];
+        prediction += coefficients[3] * pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+        prediction += coefficients[11] * pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = coefficients[0] * pDecodedSamples[-1];
+        prediction += coefficients[1] * pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = coefficients[0]  * pDecodedSamples[-1];
+        prediction += coefficients[1]  * pDecodedSamples[-2];
+        prediction += coefficients[2]  * pDecodedSamples[-3];
+        prediction += coefficients[3]  * pDecodedSamples[-4];
+        prediction += coefficients[4]  * pDecodedSamples[-5];
+        prediction += coefficients[5]  * pDecodedSamples[-6];
+        prediction += coefficients[6]  * pDecodedSamples[-7];
+        prediction += coefficients[7]  * pDecodedSamples[-8];
+        prediction += coefficients[8]  * pDecodedSamples[-9];
+        prediction += coefficients[9]  * pDecodedSamples[-10];
+        prediction += coefficients[10] * pDecodedSamples[-11];
+    }
+    else
+    {
+        prediction = 0;
+        for (int j = 0; j < (int)order; ++j) {
+            prediction += coefficients[j] * pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    // Experiment #2. See if we can use a switch and let the compiler optimize it to a jump table.
+    // Result: VC++ definitely optimizes this to a single jmp as expected. I expect other compilers should do the same, but I've
+    // not verified yet.
+#if 1
+    int prediction = 0;
+    // Every case deliberately falls through to the next one, so entering at `order` adds the terms for order, order-1, ..., 1.
+    switch (order)
+    {
+    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
+    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
+    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
+    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
+    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
+    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
+    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
+    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
+    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
+    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
+    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
+    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
+    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
+    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
+    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
+    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
+    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
+    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
+    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
+    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
+    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
+    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
+    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
+    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
+    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
+    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
+    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
+    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
+    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
+    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
+    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
+    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
+    }
+#endif
+    // An order of 0 matches no case above, so the sum stays 0 and the result is 0 >> shift.
+    return (int32_t)(prediction >> shift);
+}
+
+static DRFLAC_INLINE int32_t drflac__calculate_prediction(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
+{
+    assert(order <= 32);
+    // Returns (sum for j in [0, order) of coefficients[j] * pDecodedSamples[-1-j]) >> shift, accumulated in a long long and then narrowed to int32_t.
+    // 64-bit version.
+
+    // This method is faster on the 32-bit build when compiling with VC++. See note below.
+#ifndef DRFLAC_64BIT
+    long long prediction;
+    if (order == 8)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7] * (long long)pDecodedSamples[-8];
+    }
+    else if (order == 7)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
+    }
+    else if (order == 3)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+    }
+    else if (order == 6)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
+    }
+    else if (order == 5)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
+    }
+    else if (order == 4)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
+    }
+    else if (order == 12)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+        prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
+    }
+    else if (order == 2)
+    {
+        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
+    }
+    else if (order == 1)
+    {
+        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
+    }
+    else if (order == 10)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+    }
+    else if (order == 9)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+    }
+    else if (order == 11)
+    {
+        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
+        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
+        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
+        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
+        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
+        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
+        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
+        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
+        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
+        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
+        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+    }
+    else
+    {
+        prediction = 0;
+        for (int j = 0; j < (int)order; ++j) {
+            prediction += (long long)coefficients[j] * (long long)pDecodedSamples[-j-1];
+        }
+    }
+#endif
+
+    // Experiment #2. See if we can use a switch and let the compiler optimize it to a single jmp instruction.
+    // Result: VC++ optimizes this to a single jmp on the 64-bit build, but for some reason the 32-bit version compiles to less efficient
+    // code. Thus, we use this version on the 64-bit build and the uglier version above for the 32-bit build. If anyone has an idea on how
+    // I can get VC++ to generate an efficient jump table for the 32-bit build let me know.
+#ifdef DRFLAC_64BIT
+    long long prediction = 0;
+    // Every case deliberately falls through to the next one, so entering at `order` adds the terms for order, order-1, ..., 1.
+    switch (order)
+    {
+    case 32: prediction += (long long)coefficients[31] * (long long)pDecodedSamples[-32];
+    case 31: prediction += (long long)coefficients[30] * (long long)pDecodedSamples[-31];
+    case 30: prediction += (long long)coefficients[29] * (long long)pDecodedSamples[-30];
+    case 29: prediction += (long long)coefficients[28] * (long long)pDecodedSamples[-29];
+    case 28: prediction += (long long)coefficients[27] * (long long)pDecodedSamples[-28];
+    case 27: prediction += (long long)coefficients[26] * (long long)pDecodedSamples[-27];
+    case 26: prediction += (long long)coefficients[25] * (long long)pDecodedSamples[-26];
+    case 25: prediction += (long long)coefficients[24] * (long long)pDecodedSamples[-25];
+    case 24: prediction += (long long)coefficients[23] * (long long)pDecodedSamples[-24];
+    case 23: prediction += (long long)coefficients[22] * (long long)pDecodedSamples[-23];
+    case 22: prediction += (long long)coefficients[21] * (long long)pDecodedSamples[-22];
+    case 21: prediction += (long long)coefficients[20] * (long long)pDecodedSamples[-21];
+    case 20: prediction += (long long)coefficients[19] * (long long)pDecodedSamples[-20];
+    case 19: prediction += (long long)coefficients[18] * (long long)pDecodedSamples[-19];
+    case 18: prediction += (long long)coefficients[17] * (long long)pDecodedSamples[-18];
+    case 17: prediction += (long long)coefficients[16] * (long long)pDecodedSamples[-17];
+    case 16: prediction += (long long)coefficients[15] * (long long)pDecodedSamples[-16];
+    case 15: prediction += (long long)coefficients[14] * (long long)pDecodedSamples[-15];
+    case 14: prediction += (long long)coefficients[13] * (long long)pDecodedSamples[-14];
+    case 13: prediction += (long long)coefficients[12] * (long long)pDecodedSamples[-13];
+    case 12: prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
+    case 11: prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
+    case 10: prediction += (long long)coefficients[ 9] * (long long)pDecodedSamples[-10];
+    case  9: prediction += (long long)coefficients[ 8] * (long long)pDecodedSamples[- 9];
+    case  8: prediction += (long long)coefficients[ 7] * (long long)pDecodedSamples[- 8];
+    case  7: prediction += (long long)coefficients[ 6] * (long long)pDecodedSamples[- 7];
+    case  6: prediction += (long long)coefficients[ 5] * (long long)pDecodedSamples[- 6];
+    case  5: prediction += (long long)coefficients[ 4] * (long long)pDecodedSamples[- 5];
+    case  4: prediction += (long long)coefficients[ 3] * (long long)pDecodedSamples[- 4];
+    case  3: prediction += (long long)coefficients[ 2] * (long long)pDecodedSamples[- 3];
+    case  2: prediction += (long long)coefficients[ 1] * (long long)pDecodedSamples[- 2];
+    case  1: prediction += (long long)coefficients[ 0] * (long long)pDecodedSamples[- 1];
+    }
+#endif
+    // With order == 0 neither variant adds any term, so the sum stays 0 and the result is 0 >> shift.
+    return (int32_t)(prediction >> shift);
+}
+
+
+// Reads and decodes a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
+//
+// This is the most frequently called function in the library. It does both the Rice decoding and the prediction in a single loop
+// iteration.
+static bool drflac__decode_samples_with_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(pSamplesOut != NULL);
+    // Decodes `count` residuals; pSamplesOut[i] = residual + prediction from the `order` samples stored just before it. Returns false if more input cannot be loaded.
+    static unsigned int bitOffsetTable[] = {    // 4-bit value -> 1-based position of its first set bit (0 when it has none).
+        0,
+        4,
+        3, 3,
+        2, 2, 2, 2,
+        1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
+    drflac_cache_t resultHiShift = DRFLAC_CACHE_L1_SIZE_BITS - riceParam;
+    // NOTE(review): with riceParam == 0, resultHiShift equals the full cache width, making `resultHi >> resultHiShift` below a full-width shift - confirm whether 0 can be passed.
+    for (int i = 0; i < (int)count; ++i)
+    {
+        unsigned int zeroCounter = 0;
+        while (pFlac->cache == 0) {
+            zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
+            if (!drflac__reload_cache(pFlac)) {
+                return false;
+            }
+        }
+
+        // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
+        // no need for us to perform any cache reloading logic here which should make things much faster.
+        assert(pFlac->cache != 0);
+        unsigned int decodedRice;
+
+        unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
+        if (setBitOffsetPlus1 > 0) {
+            decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+        } else {
+            if (pFlac->cache == 1) {
+                setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
+                decodedRice = (zeroCounter + (DRFLAC_CACHE_L1_SIZE_BITS-1)) << riceParam;
+            } else {
+                setBitOffsetPlus1 = 5;
+                for (;;)
+                {
+                    if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
+                        decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
+                        break;
+                    }
+
+                    setBitOffsetPlus1 += 1;
+                }
+            }
+        }
+
+        // Next come the riceParam low-order bits that follow the set bit; decodedRice already holds the zero count shifted up by riceParam.
+        unsigned int bitsLo = 0;
+        unsigned int riceLength = setBitOffsetPlus1 + riceParam;
+        if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING)
+        {
+            bitsLo = (unsigned int)((pFlac->cache & (riceParamMask >> setBitOffsetPlus1)) >> (DRFLAC_CACHE_L1_SIZE_BITS - riceLength));
+
+            pFlac->consumedBits += riceLength;
+            pFlac->cache <<= riceLength;
+        }
+        else
+        {
+            pFlac->consumedBits += riceLength;
+            pFlac->cache = (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS) ? (pFlac->cache << setBitOffsetPlus1) : 0;    // A shift by the full cache width is undefined.
+
+            // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
+            size_t bitCountLo = pFlac->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS;
+            drflac_cache_t resultHi = pFlac->cache & riceParamMask;    // <-- This mask is OK because all bits after the first bits are always zero.
+
+            // Load the next cache word: directly from the L2 cache while a line is available, otherwise through the full reload.
+            if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
+                pFlac->cache = drflac__be2host__cache_line(pFlac->cacheL2[pFlac->nextL2Line++]);
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(pFlac)) {
+                    return false;
+                }
+            }
+
+            bitsLo = (unsigned int)((resultHi >> resultHiShift) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo));
+            pFlac->consumedBits = bitCountLo;
+            pFlac->cache <<= bitCountLo;
+        }
+
+        // Map the unsigned code back to a signed residual: odd codes become ~(code >> 1), even codes become code >> 1.
+        decodedRice |= bitsLo;
+        if ((decodedRice & 0x01)) {
+            decodedRice = ~(decodedRice >> 1);
+        } else {
+            decodedRice = (decodedRice >> 1);
+        }
+
+
+        // In order to properly calculate the prediction when the bits per sample is >16 we need to do it using 64-bit arithmetic. We can assume this
+        // is probably going to be slower on 32-bit systems so we'll do a more optimized 32-bit version when the bits per sample is low enough.
+        if (pFlac->currentFrame.bitsPerSample > 16) {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i));
+        } else {
+            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i));
+        }
+    }
+
+    return true;
+}
+
+
+// Skips over <count> consecutive Rice codes without keeping their values. The decoder should be sitting on the first bit of
+// the first code. Stops and returns false as soon as drflac__read_and_seek_rice() reports a failure.
+static bool drflac__read_and_seek_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+
+    bool stillOk = true;
+    for (unsigned int codesDone = 0; stillOk && codesDone < count; ++codesDone) {
+        stillOk = drflac__read_and_seek_rice(pFlac, riceParam);
+    }
+
+    return stillOk;
+}
+
+// Decodes <count> residuals that are stored unencoded, each read with drflac__read_int32() at <unencodedBitsPerSample> bits, and adds the prediction to every one of them.
+static bool drflac__decode_samples_with_residual__unencoded(drflac* pFlac, unsigned int count, unsigned char unencodedBitsPerSample, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
+{
+    assert(pFlac != NULL);
+    assert(count > 0);
+    assert(unencodedBitsPerSample > 0 && unencodedBitsPerSample <= 32);
+    assert(pSamplesOut != NULL);
+
+    for (int* pSample = pSamplesOut; pSample != pSamplesOut + count; ++pSample) {
+        // The residual lands directly in the output slot; the prediction is then added on top of it.
+        if (!drflac__read_int32(pFlac, unencodedBitsPerSample, pSample)) {
+            return false;
+        }
+        *pSample += drflac__calculate_prediction(order, shift, coefficients, pSample);
+    }
+
+    return true;
+}
+
+
+// Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> entries of <pDecodedSamples>
+// (the warm-up samples) are skipped; the remaining <blockSize> - <order> entries are filled in by adding each decoded
+// residual to the prediction derived from <order>, <shift> and <coefficients>.
+//
+// Returns false on a read failure, on an unsupported residual coding method, or when the partition order is inconsistent
+// with <blockSize> and <order>.
+static bool drflac__decode_samples_with_residual(drflac* pFlac, unsigned int blockSize, unsigned int order, int shift, const short* coefficients, int* pDecodedSamples)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+    assert(pDecodedSamples != NULL);       // <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode?
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    // Every partition holds blockSize / 2^partitionOrder residuals, except the first one which holds <order> fewer because
+    // the warm-up samples have no residual. The arithmetic is unsigned, so a corrupt stream whose partitions are smaller
+    // than <order> would wrap around to a huge count and write far past the end of the sample buffer. Reject it here.
+    const unsigned int partitionCount = 1u << partitionOrder;
+    const unsigned int samplesPerPartition = blockSize / partitionCount;
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    // The Rice parameter is 4 bits wide for RICE and 5 bits wide for RICE2. In both cases the all-ones value (15 and 31
+    // respectively) is the escape code announcing a partition of unencoded residuals. A 4-bit field can never hold 16 and
+    // a 5-bit field can never hold 32, so the escape code has to be compared against the all-ones value.
+    const unsigned int riceParamBitCount = (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) ? 4 : 5;
+    const unsigned char escapeCode = (unsigned char)((1u << riceParamBitCount) - 1);
+
+    // Ignore the first <order> values.
+    pDecodedSamples += order;
+
+    // Walk the partitions in stream order; the output pointer moves past each partition once it has been decoded.
+    for (unsigned int iPartition = 0; iPartition < partitionCount; ++iPartition)
+    {
+        // Only the first partition is shortened by the warm-up samples.
+        const unsigned int samplesInPartition = (iPartition == 0) ? (samplesPerPartition - order) : samplesPerPartition;
+
+        unsigned char riceParam = 0;
+        if (!drflac__read_uint8(pFlac, riceParamBitCount, &riceParam)) {
+            return false;
+        }
+
+        if (riceParam != escapeCode) {
+            if (!drflac__decode_samples_with_residual__rice(pFlac, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
+                return false;
+            }
+        } else {
+            // Escaped partition: a 5-bit sample width comes first, then the residuals stored at that width.
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            if (!drflac__decode_samples_with_residual__unencoded(pFlac, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
+                return false;
+            }
+        }
+
+        pDecodedSamples += samplesInPartition;
+    }
+
+    return true;
+}
+
+// Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
+// when the decoder is sitting at the very start of the RESIDUAL block. No residual value is decoded or stored; the
+// <blockSize> and <order> parameters are only used to determine how many residual values need to be skipped.
+//
+// Returns false on a read failure, on an unsupported residual coding method, or when the partition order is inconsistent
+// with <blockSize> and <order>.
+static bool drflac__read_and_seek_residual(drflac* pFlac, unsigned int blockSize, unsigned int order)
+{
+    assert(pFlac != NULL);
+    assert(blockSize != 0);
+
+    unsigned char residualMethod;
+    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
+        return false;
+    }
+
+    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
+        return false;    // Unknown or unsupported residual coding method.
+    }
+
+    unsigned char partitionOrder;
+    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
+        return false;
+    }
+
+    // Every partition holds blockSize / 2^partitionOrder residuals, except the first one which holds <order> fewer because
+    // the warm-up samples have no residual. The arithmetic is unsigned, so a corrupt stream whose partitions are smaller
+    // than <order> would wrap around to a huge count and send the decoder skipping through unrelated data. Reject it here.
+    const unsigned int partitionCount = 1u << partitionOrder;
+    const unsigned int samplesPerPartition = blockSize / partitionCount;
+    if (samplesPerPartition < order) {
+        return false;
+    }
+
+    // The Rice parameter is 4 bits wide for RICE and 5 bits wide for RICE2. In both cases the all-ones value (15 and 31
+    // respectively) is the escape code announcing a partition of unencoded residuals. A 4-bit field can never hold 16 and
+    // a 5-bit field can never hold 32, so the escape code has to be compared against the all-ones value.
+    const unsigned int riceParamBitCount = (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) ? 4 : 5;
+    const unsigned char escapeCode = (unsigned char)((1u << riceParamBitCount) - 1);
+
+    for (unsigned int iPartition = 0; iPartition < partitionCount; ++iPartition)
+    {
+        // Only the first partition is shortened by the warm-up samples.
+        const unsigned int samplesInPartition = (iPartition == 0) ? (samplesPerPartition - order) : samplesPerPartition;
+
+        unsigned char riceParam = 0;
+        if (!drflac__read_uint8(pFlac, riceParamBitCount, &riceParam)) {
+            return false;
+        }
+
+        if (riceParam != escapeCode) {
+            if (!drflac__read_and_seek_residual__rice(pFlac, samplesInPartition, riceParam)) {
+                return false;
+            }
+        } else {
+            // Escaped partition: a 5-bit sample width comes first, then the residuals stored at that width.
+            unsigned char unencodedBitsPerSample = 0;
+            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
+                return false;
+            }
+
+            if (!drflac__seek_bits(pFlac, unencodedBitsPerSample * samplesInPartition)) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+
+// Decodes a CONSTANT sub-frame: a single value is read from the stream and copied into every sample slot of the block.
+static bool drflac__decode_samples__constant(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    int constantValue;
+    if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &constantValue)) {
+        return false;
+    }
+
+    // Expanding the value is not strictly necessary, but it simplifies reading the samples back later on.
+    const unsigned int sampleCount = pFlac->currentFrame.blockSize;
+    for (unsigned int iSample = 0; iSample < sampleCount; ++iSample) {
+        pSubframe->pDecodedSamples[iSample] = constantValue;
+    }
+
+    return true;
+}
+
+// Decodes a VERBATIM sub-frame: every sample of the block is read from the stream at the sub-frame's bitsPerSample.
+static bool drflac__decode_samples__verbatim(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    const unsigned int sampleCount = pFlac->currentFrame.blockSize;
+    for (unsigned int iSample = 0; iSample < sampleCount; ++iSample) {
+        int rawSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &rawSample)) {
+            return false;
+        }
+        pSubframe->pDecodedSamples[iSample] = rawSample;
+    }
+    return true;
+}
+
+// Decodes a FIXED sub-frame: <lpcOrder> warm-up samples stored verbatim, followed by a residual that is applied to one of
+// five built-in predictors. The predictor is selected by <lpcOrder> and is evaluated with a shift of zero.
+static bool drflac__decode_samples__fixed(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    // Row N holds the coefficients of the fixed predictor of order N.
+    short fixedPredictors[5][4] = {
+        {0,  0, 0,  0},
+        {1,  0, 0,  0},
+        {2, -1, 0,  0},
+        {3, -3, 1,  0},
+        {4, -6, 4, -1}
+    };
+
+    const unsigned int predictorOrder = pSubframe->lpcOrder;
+
+    // Warm-up samples.
+    for (unsigned int iWarmup = 0; iWarmup < predictorOrder; ++iWarmup) {
+        int warmupSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &warmupSample)) {
+            return false;
+        }
+        pSubframe->pDecodedSamples[iWarmup] = warmupSample;
+    }
+
+    // The residual decoder fills in the rest of the block.
+    return drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, predictorOrder, 0, fixedPredictors[predictorOrder], pSubframe->pDecodedSamples);
+}
+
+// Decodes an LPC sub-frame. The fields are consumed in this order: <lpcOrder> warm-up samples, a 4-bit coefficient
+// precision (stored minus one, with the value 15 treated as invalid), a 5-bit shift read with drflac__read_int8(),
+// <lpcOrder> coefficients of that precision, and finally the residual.
+static bool drflac__decode_samples__lpc(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    const unsigned int predictorOrder = pSubframe->lpcOrder;
+
+    // Warm-up samples.
+    for (unsigned int iWarmup = 0; iWarmup < predictorOrder; ++iWarmup) {
+        int warmupSample;
+        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &warmupSample)) {
+            return false;
+        }
+        pSubframe->pDecodedSamples[iWarmup] = warmupSample;
+    }
+
+    // Coefficient precision, in bits.
+    unsigned char coefficientPrecision;
+    if (!drflac__read_uint8(pFlac, 4, &coefficientPrecision) || coefficientPrecision == 15) {
+        return false;    // Read failure, or the invalid precision code.
+    }
+    coefficientPrecision += 1;
+
+    signed char predictionShift;
+    if (!drflac__read_int8(pFlac, 5, &predictionShift)) {
+        return false;
+    }
+
+    // Only the first <lpcOrder> entries of the array are filled in.
+    short predictorCoefficients[32];
+    short* pCoefficient = predictorCoefficients;
+    for (unsigned int remaining = predictorOrder; remaining > 0; --remaining) {
+        if (!drflac__read_int16(pFlac, coefficientPrecision, pCoefficient)) {
+            return false;
+        }
+        pCoefficient += 1;
+    }
+
+    // The residual decoder fills in the rest of the block from the warm-up samples, the shift and the coefficients.
+    return drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, predictorOrder, predictionShift, predictorCoefficients, pSubframe->pDecodedSamples);
+}
+
+
+// Reads the header of the next frame into pFlac->currentFrame and zeroes the frame's sub-frame array. Returns false when the
+// stream cannot be read, when the 14-bit sync code does not match, or when the block size, sample rate, channel assignment
+// or sample size field holds a reserved code.
+static bool drflac__read_next_frame_header(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+    assert(pFlac->onRead != NULL);
+
+    // At the moment the sync code is as a form of basic validation. The CRC is stored, but is unused at the moment. This
+    // should probably be handled better in the future.
+    const int sampleRateTable[12]       = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
+    const uint8_t bitsPerSampleTable[8] = {0, 8, 12, (uint8_t)-1, 16, 20, 24, (uint8_t)-1};   // -1 = reserved.
+
+    unsigned short syncCode = 0;
+    if (!drflac__read_uint16(pFlac, 14, &syncCode)) {
+        return false;
+    }
+    if (syncCode != 0x3FFE) {
+        // TODO: Try and recover by attempting to seek to and read the next frame?
+        return false;
+    }
+
+    unsigned char reserved;
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    unsigned char blockingStrategy = 0;
+    if (!drflac__read_uint8(pFlac, 1, &blockingStrategy)) {
+        return false;
+    }
+
+    unsigned char blockSize = 0;
+    if (!drflac__read_uint8(pFlac, 4, &blockSize)) {
+        return false;
+    }
+
+    unsigned char sampleRate = 0;
+    if (!drflac__read_uint8(pFlac, 4, &sampleRate)) {
+        return false;
+    }
+
+    unsigned char channelAssignment = 0;
+    if (!drflac__read_uint8(pFlac, 4, &channelAssignment)) {
+        return false;
+    }
+
+    unsigned char bitsPerSample = 0;
+    if (!drflac__read_uint8(pFlac, 3, &bitsPerSample)) {
+        return false;
+    }
+
+    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
+        return false;
+    }
+
+    // Reject reserved codes before they are used. A block size code of 0 would otherwise reach the power-of-two branch
+    // below and shift by a negative amount, and a channel assignment above 10 would index past the end of the 11-entry
+    // channel count table consulted when the frame is decoded or skipped. Reserved sample sizes are marked in the table.
+    if (blockSize == 0 || channelAssignment > 10 || bitsPerSampleTable[bitsPerSample] == (uint8_t)-1) {
+        return false;
+    }
+
+    unsigned char isVariableBlockSize = blockingStrategy == 1;
+    if (isVariableBlockSize) {
+        pFlac->currentFrame.frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &pFlac->currentFrame.sampleNumber)) {
+            return false;
+        }
+    } else {
+        unsigned long long frameNumber = 0;
+        if (!drflac__read_utf8_coded_number(pFlac, &frameNumber)) {
+            return false;
+        }
+        pFlac->currentFrame.frameNumber  = (unsigned int)frameNumber;   // <-- Safe cast.
+        pFlac->currentFrame.sampleNumber = 0;
+    }
+
+    if (blockSize == 1) {
+        pFlac->currentFrame.blockSize = 192;
+    } else if (blockSize >= 2 && blockSize <= 5) {
+        pFlac->currentFrame.blockSize = 576 * (1 << (blockSize - 2));
+    } else if (blockSize == 6) {
+        if (!drflac__read_uint16(pFlac, 8, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else if (blockSize == 7) {
+        if (!drflac__read_uint16(pFlac, 16, &pFlac->currentFrame.blockSize)) {
+            return false;
+        }
+        pFlac->currentFrame.blockSize += 1;
+    } else {
+        pFlac->currentFrame.blockSize = 256 * (1 << (blockSize - 8));   // Only codes 8..15 get here, so the shift is 0..7.
+    }
+
+    if (sampleRate <= 11) {
+        pFlac->currentFrame.sampleRate = sampleRateTable[sampleRate];
+    } else if (sampleRate == 12) {
+        if (!drflac__read_uint32(pFlac, 8, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 1000;
+    } else if (sampleRate == 13) {
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+    } else if (sampleRate == 14) {
+        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
+            return false;
+        }
+        pFlac->currentFrame.sampleRate *= 10;
+    } else {
+        return false;  // Invalid.
+    }
+
+    pFlac->currentFrame.channelAssignment = channelAssignment;
+    pFlac->currentFrame.bitsPerSample = bitsPerSampleTable[bitsPerSample];
+    if (pFlac->currentFrame.bitsPerSample == 0) {
+        pFlac->currentFrame.bitsPerSample = pFlac->bitsPerSample;   // Code 0: fall back to the stream-wide value.
+    }
+
+    if (drflac__read_uint8(pFlac, 8, &pFlac->currentFrame.crc8) != 1) {
+        return false;
+    }
+
+    memset(pFlac->currentFrame.subframes, 0, sizeof(pFlac->currentFrame.subframes));
+    return true;
+}
+
+// Reads a sub-frame header into <pSubframe>: the sub-frame type, the predictor order of FIXED and LPC sub-frames, and the
+// number of wasted bits per sample. Returns false on a read failure, a set padding bit or a reserved type code.
+static bool drflac__read_subframe_header(drflac* pFlac, drflac_subframe* pSubframe)
+{
+    unsigned char header;
+    if (!drflac__read_uint8(pFlac, 8, &header)) {
+        return false;
+    }
+    if ((header & 0x80) != 0) {
+        return false;    // First bit should always be 0.
+    }
+
+    // 6-bit type code: 000000 = CONSTANT, 000001 = VERBATIM, 001xxx = FIXED of order xxx (0..4 only) and 1xxxxx = LPC of
+    // order xxxxx+1. Everything else is reserved. That includes 01xxxx, half of which also has bit 3 set, so FIXED is
+    // recognised by comparing the top three bits against 001 rather than by testing bit 3 on its own.
+    int type = (header & 0x7E) >> 1;
+    if (type == 0) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
+    } else if (type == 1) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
+    } else if ((type & 0x20) != 0) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
+        pSubframe->lpcOrder = (type & 0x1F) + 1;
+    } else if ((type >> 3) == 1) {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
+        pSubframe->lpcOrder = (type & 0x07);
+        if (pSubframe->lpcOrder > 4) {
+            pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+            pSubframe->lpcOrder = 0;
+        }
+    } else {
+        pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
+    }
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
+        return false;
+    }
+
+    // Wasted bits per sample. Only present when the lowest header bit is set; the stored count is then one more than the value reported by drflac__seek_past_next_set_bit().
+    pSubframe->wastedBitsPerSample = 0;
+    if ((header & 0x01) == 1) {
+        unsigned int wastedBitsPerSample;
+        if (!drflac__seek_past_next_set_bit(pFlac, &wastedBitsPerSample)) {
+            return false;
+        }
+        pSubframe->wastedBitsPerSample = (unsigned char)wastedBitsPerSample + 1;
+    }
+
+    return true;
+}
+
+// Reads the header of sub-frame <subframeIndex> of the current frame and decodes its samples into that sub-frame's slice
+// of pFlac->pDecodedSamples, which starts <blockSize> * <subframeIndex> samples into the buffer. Returns false when the
+// header cannot be read, the sub-frame type is not recognised, or the sample data fails to decode.
+static bool drflac__decode_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
+    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
+        return false;
+    }
+
+    // Side channels require an extra bit per sample. Took a while to figure that one out...
+    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
+        pSubframe->bitsPerSample += 1;
+    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
+        pSubframe->bitsPerSample += 1;
+    }
+
+    // Need to handle wasted bits per sample.
+    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
+    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    // Each decoder returns false when the sample data cannot be read. That result is handed on to the caller so that a
+    // sub-frame which could not be decoded is never reported as a success. <decoded> starts out false, which is also
+    // the answer for a sub-frame type that none of the cases below recognises.
+    bool decoded = false;
+    switch (pSubframe->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+            decoded = drflac__decode_samples__constant(pFlac, pSubframe);
+            break;
+        case DRFLAC_SUBFRAME_VERBATIM:
+            decoded = drflac__decode_samples__verbatim(pFlac, pSubframe);
+            break;
+        case DRFLAC_SUBFRAME_FIXED:
+            decoded = drflac__decode_samples__fixed(pFlac, pSubframe);
+            break;
+        case DRFLAC_SUBFRAME_LPC:
+            decoded = drflac__decode_samples__lpc(pFlac, pSubframe);
+            break;
+        default:
+            break;
+    }
+
+    return decoded;
+}
+
+// Reads the header of sub-frame <subframeIndex> of the current frame and then skips the sub-frame's payload without
+// decoding any samples. What gets skipped depends on the sub-frame type:
+//   CONSTANT - one sample.
+//   VERBATIM - <blockSize> samples.
+//   FIXED    - <lpcOrder> warm-up samples, then the residual.
+//   LPC      - <lpcOrder> warm-up samples, a 4-bit precision, a 5-bit shift, <lpcOrder> coefficients, then the residual.
+//
+// Returns false on any read or seek failure, or when the sub-frame type is not one of the four above.
+static bool drflac__seek_subframe(drflac* pFlac, int subframeIndex)
+{
+    assert(pFlac != NULL);
+
+    drflac_subframe* pSubframe = &pFlac->currentFrame.subframes[subframeIndex];
+    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
+        return false;
+    }
+
+    // A side channel needs one bit more per sample than the frame header states. It is the second sub-frame for the
+    // left/side and mid/side assignments, and the first sub-frame for the right/side assignment.
+    const bool sideIsSecond = (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE);
+    const bool sideIsFirst  = (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE);
+
+    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
+    if ((sideIsSecond && subframeIndex == 1) || (sideIsFirst && subframeIndex == 0)) {
+        pSubframe->bitsPerSample += 1;
+    }
+
+    // The wasted bits are taken off the sample width.
+    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
+    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);
+
+    // CONSTANT and VERBATIM payloads have a size known up front and are skipped right here. FIXED and LPC share the code below.
+    switch (pSubframe->subframeType)
+    {
+        case DRFLAC_SUBFRAME_CONSTANT:
+            // A single sample is all that is stored.
+            return drflac__seek_bits(pFlac, pSubframe->bitsPerSample);
+
+        case DRFLAC_SUBFRAME_VERBATIM:
+        {
+            // One sample for every position in the block.
+            unsigned int verbatimBits = pFlac->currentFrame.blockSize * pSubframe->bitsPerSample;
+            return drflac__seek_bits(pFlac, verbatimBits);
+        }
+
+        case DRFLAC_SUBFRAME_FIXED:
+        case DRFLAC_SUBFRAME_LPC:
+            break;
+
+        default:
+            // Not a sub-frame type that can be skipped.
+            return false;
+    }
+
+    // FIXED and LPC both begin with <lpcOrder> warm-up samples.
+    unsigned int warmupBits = pSubframe->lpcOrder * pSubframe->bitsPerSample;
+    if (!drflac__seek_bits(pFlac, warmupBits)) {
+        return false;
+    }
+
+    // LPC additionally stores the coefficient precision, the shift and the coefficients themselves.
+    if (pSubframe->subframeType == DRFLAC_SUBFRAME_LPC) {
+        // The precision is stored minus one and has to be read, because the size of the coefficients depends on it.
+        unsigned char lpcPrecision;
+        if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
+            return false;
+        }
+        if (lpcPrecision == 15) {
+            return false;    // Invalid.
+        }
+        lpcPrecision += 1;
+
+        // The shift and the coefficients are not needed for seeking, so they are skipped in one go.
+        unsigned int coefficientBits = (pSubframe->lpcOrder * lpcPrecision) + 5;    // +5 for shift.
+        if (!drflac__seek_bits(pFlac, coefficientBits)) {
+            return false;
+        }
+    }
+
+    // Both types end with the residual.
+    return drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder);
+}
+
+
+// Returns the channel count for a frame header's channel assignment code (0-7 map to 1-8 channels, 8-10 map to 2). Codes outside 0..10 trip the assert in debug builds and yield 0 otherwise, instead of reading past the end of the table.
+static DRFLAC_INLINE int drflac__get_channel_count_from_channel_assignment(int channelAssignment)
+{
+    assert(channelAssignment >= 0 && channelAssignment <= 10);
+    int lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
+    return (channelAssignment >= 0 && channelAssignment <= 10) ? lookup[channelAssignment] : 0;
+}
+
+// Decodes every sub-frame of the current frame. This function should be called while the stream is sitting on the first byte after the frame header.
+static bool drflac__decode_frame(drflac* pFlac)
+{
+    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+    // The sample buffer holds maxBlockSize * channels samples as announced by STREAMINFO; a frame claiming more would be decoded past its end.
+    if (channelCount > (int)pFlac->channels || pFlac->currentFrame.blockSize > pFlac->maxBlockSize) {
+        return false;
+    }
+
+    for (int i = 0; i < channelCount; ++i) {
+        if (!drflac__decode_subframe(pFlac, i)) {
+            return false;
+        }
+    }
+
+    // At the end of the frame sits the padding and CRC. We don't use these so we can just seek past.
+    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
+        return false;
+    }
+    pFlac->currentFrame.samplesRemaining = pFlac->currentFrame.blockSize * channelCount;
+    return true;
+}
+
+// Skips the rest of the current frame without decoding it: every sub-frame first, then the trailing padding and CRC.
+static bool drflac__seek_frame(drflac* pFlac)
+{
+    const int subframeCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+    for (int iSubframe = 0; iSubframe < subframeCount; ++iSubframe) {
+        if (!drflac__seek_subframe(pFlac, iSubframe)) {
+            return false;
+        }
+    }
+
+    // Padding (the bits remaining in L1, modulo 8) plus 16 bits for the CRC.
+    return drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16);
+}
+
+// Reads the next frame header and, provided that succeeds, decodes the frame that follows it. Returns false if either
+// step fails.
+static bool drflac__read_and_decode_next_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    // && short-circuits, so no decode is attempted when the header could not be read.
+    const bool headerOk = drflac__read_next_frame_header(pFlac);
+    return headerOk && drflac__decode_frame(pFlac);
+}
+
+// Reads a metadata block header, which is made of a 1-bit "last block" flag, a 7-bit block type and a 24-bit payload size.
+// The block type is the return value; the size and the flag are written through <pBlockSizeOut> and <pIsLastBlockOut>,
+// either of which may be NULL.
+//
+// Reading stops at the first field that cannot be read. The fields after it keep their initial values, which are: flag set,
+// DRFLAC_BLOCK_TYPE_INVALID and a size of 0.
+static unsigned int drflac__read_block_header(drflac* pFlac, unsigned int* pBlockSizeOut, bool* pIsLastBlockOut)    // Returns the block type.
+{
+    assert(pFlac != NULL);
+
+    unsigned char lastBlockFlag = 1;
+    unsigned char type = DRFLAC_BLOCK_TYPE_INVALID;
+    unsigned int payloadSize = 0;
+
+    // Each read is only attempted when the one before it succeeded.
+    if (drflac__read_uint8(pFlac, 1, &lastBlockFlag)) {
+        if (drflac__read_uint8(pFlac, 7, &type)) {
+            (void)drflac__read_uint32(pFlac, 24, &payloadSize);
+        }
+    }
+
+    // Hand the results back through whichever output pointers were supplied.
+    if (pBlockSizeOut != NULL) {
+        *pBlockSizeOut = payloadSize;
+    }
+
+    if (pIsLastBlockOut != NULL) {
+        *pIsLastBlockOut = lastBlockFlag;
+    }
+
+    return type;
+}
+
+
+// Computes the zero-based, inclusive range of sample indices covered by the current frame, counted across all channels. When the
+// stored sample number is 0 the start is computed from the frame number and maxBlockSize instead. Either output may be NULL.
+static void drflac__get_current_frame_sample_range(drflac* pFlac, uint64_t* pFirstSampleInFrameOut, uint64_t* pLastSampleInFrameOut)
+{
+    assert(pFlac != NULL);
+
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+    uint64_t firstSampleInFrame = pFlac->currentFrame.sampleNumber;
+    if (firstSampleInFrame == 0) {
+        firstSampleInFrame = (uint64_t)pFlac->currentFrame.frameNumber * pFlac->maxBlockSize * channelCount;   // Widened first so the product cannot wrap at 32 bits.
+    }
+
+    uint64_t lastSampleInFrame = firstSampleInFrame + (pFlac->currentFrame.blockSize*channelCount);
+    if (lastSampleInFrame > 0) {
+        lastSampleInFrame -= 1; // Needs to be zero based.
+    }
+
+    if (pFirstSampleInFrameOut) {
+        *pFirstSampleInFrameOut = firstSampleInFrame;
+    }
+    if (pLastSampleInFrameOut) {
+        *pLastSampleInFrameOut = lastSampleInFrame;
+    }
+}
+
+// Seeks the stream to the first frame and resets the L1 cache and the current-frame state. The reset happens whether or not
+// the seek succeeded; the seek result is the return value.
+static bool drflac__seek_to_first_frame(drflac* pFlac)
+{
+    assert(pFlac != NULL);
+
+    const bool seekSucceeded = drflac__seek_to_byte(pFlac, (long long)pFlac->firstFramePos);
+    // consumedBits is set to the full L1 size, presumably so that nothing stale is served from the zeroed cache.
+    pFlac->cache = 0;
+    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
+    memset(&pFlac->currentFrame, 0, sizeof(pFlac->currentFrame));
+    return seekSucceeded;
+}
+
+static DRFLAC_INLINE bool drflac__seek_to_next_frame(drflac* pFlac)
+{
+    // Skips the rest of the current frame via drflac__seek_frame(). This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section.
+    assert(pFlac != NULL);
+    return drflac__seek_frame(pFlac);
+}
+
+// Scans the stream from the first frame onwards, one frame header at a time, until the frame whose sample range contains
+// <sampleIndex> has been found. On success the decoder is left right after that frame's header and the frame itself has
+// not been decoded. Returns false when a seek or a header read fails before such a frame turns up.
+static bool drflac__seek_to_frame_containing_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    if (!drflac__seek_to_first_frame(pFlac)) {
+        return false;
+    }
+
+    // The header has to be read to find out which samples a frame holds.
+    while (drflac__read_next_frame_header(pFlac)) {
+        uint64_t rangeStart = 0;
+        uint64_t rangeEnd = 0;
+        drflac__get_current_frame_sample_range(pFlac, &rangeStart, &rangeEnd);
+
+        if (rangeStart <= sampleIndex && sampleIndex <= rangeEnd) {
+            return true;    // The sample is in this frame.
+        }
+
+        // Not this frame: skip its body and move on to the next header.
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    // A frame header could not be read before the sample was found.
+    return false;
+}
+
+// Seeks to <sampleIndex> without a seek table: frame headers are scanned from the start of the stream until the frame holding
+// the sample is found, that frame is decoded, and the samples in front of the target are read and thrown away.
+// Returns true only when all of those samples could be read.
+static bool drflac__seek_to_sample__brute_force(drflac* pFlac, uint64_t sampleIndex)
+{
+    if (!drflac__seek_to_frame_containing_sample(pFlac, sampleIndex)) {
+        return false;
+    }
+
+    // At this point we are just sitting on the byte after the header of the frame containing the sample.
+    uint64_t firstSampleInFrame = 0;
+    drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, NULL);
+    assert(firstSampleInFrame <= sampleIndex);
+
+    // The frame is decoded even when the target is its first sample, exactly as the seek-table path does. NOTE(review): the
+    // header is already consumed here, so presumably a later read could not start from this frame unless it is decoded now.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    // A short read means the target was not reached, so it is reported as a failure rather than as "some samples were read".
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+// Seeks to <sampleIndex> with the help of the SEEKTABLE block. The table is scanned for the last seek point that does not
+// lie beyond the target, the stream is moved to the frame it refers to, and frames are skipped until the one containing the
+// target is reached. That frame is decoded and the samples before the target are discarded. Returns false on any failure.
+static bool drflac__seek_to_sample__seek_table(drflac* pFlac, uint64_t sampleIndex)
+{
+    assert(pFlac != NULL);
+
+    if (pFlac->seektableBlock.pos == 0) {
+        return false;
+    }
+
+    if (!drflac__seek_to_byte(pFlac, pFlac->seektableBlock.pos)) {
+        return false;
+    }
+
+    // The number of seek points is derived from the size of the SEEKTABLE block.
+    unsigned int seekpointCount = pFlac->seektableBlock.sizeInBytes / 18;   // 18 = the size of each seek point.
+    if (seekpointCount == 0) {
+        return false;   // Would this ever happen?
+    }
+
+    drflac_seekpoint closestSeekpoint = {0};
+    unsigned int seekpointsRemaining = seekpointCount;
+    while (seekpointsRemaining > 0) {
+        drflac_seekpoint seekpoint;
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.firstSample)) {
+            break;
+        }
+        if (!drflac__read_uint64(pFlac, 64, &seekpoint.frameOffset)) {
+            break;
+        }
+        if (!drflac__read_uint16(pFlac, 16, &seekpoint.sampleCount)) {
+            break;
+        }
+
+        if (seekpoint.firstSample * pFlac->channels > sampleIndex) {
+            break;
+        }
+
+        closestSeekpoint = seekpoint;
+        seekpointsRemaining -= 1;
+    }
+
+    // At this point we should have found the seekpoint closest to our sample. We need to seek to it using basically the same
+    // technique as we use with the brute force method. If the seek fails there is no point in reading frames, so give up.
+    if (!drflac__seek_to_byte(pFlac, pFlac->firstFramePos + closestSeekpoint.frameOffset)) {
+        return false;
+    }
+
+    uint64_t firstSampleInFrame = 0;
+    uint64_t lastSampleInFrame = 0;
+    for (;;) {
+        // We need to read the frame's header in order to determine the range of samples it contains.
+        if (!drflac__read_next_frame_header(pFlac)) {
+            return false;
+        }
+
+        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
+        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
+            break;  // The sample is in this frame.
+        }
+
+        if (!drflac__seek_to_next_frame(pFlac)) {
+            return false;
+        }
+    }
+
+    assert(firstSampleInFrame <= sampleIndex);
+    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
+    if (!drflac__decode_frame(pFlac)) {
+        return false;
+    }
+
+    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
+    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
+}
+
+
+// Opens a FLAC stream that is accessed through the <onRead> and <onSeek> callbacks, both of which are required. The "fLaC"
+// marker and the STREAMINFO block are validated and parsed, the position and size of the APPLICATION, SEEKTABLE,
+// VORBIS_COMMENT, CUESHEET and PICTURE blocks are recorded, and the stream is left at the start of the first frame.
+// Returns a decoder allocated with malloc(), or NULL if the stream is rejected, cannot be read, or the allocation fails.
+static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData)
+{
+    if (onRead == NULL || onSeek == NULL) {
+        return NULL;
+    }
+
+    unsigned char id[4];
+    if (onRead(pUserData, id, 4) != 4 || id[0] != 'f' || id[1] != 'L' || id[2] != 'a' || id[3] != 'C') {
+        return NULL;    // Not a FLAC stream.
+    }
+
+    drflac tempFlac;
+    memset(&tempFlac, 0, sizeof(tempFlac));
+    tempFlac.onRead         = onRead;
+    tempFlac.onSeek         = onSeek;
+    tempFlac.pUserData      = pUserData;
+    tempFlac.currentBytePos = 4;
+    tempFlac.nextL2Line     = sizeof(tempFlac.cacheL2) / sizeof(tempFlac.cacheL2[0]); // <-- Initialize to this to force a client-side data retrieval right from the start.
+    tempFlac.consumedBits   = sizeof(tempFlac.cache)*8;
+
+    // The first metadata block should be the STREAMINFO block, and it has to be 34 bytes long. Both conditions must hold, so
+    // failing either one of them rejects the stream. We don't care about everything in here.
+    unsigned int blockSize;
+    bool isLastBlock;
+    int blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+    if (blockType != DRFLAC_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
+        return NULL;
+    }
+
+    if (!drflac__seek_bits(&tempFlac, 16)) {   // minBlockSize
+        return NULL;
+    }
+    if (!drflac__read_uint16(&tempFlac, 16, &tempFlac.maxBlockSize)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 48)) {   // minFrameSize + maxFrameSize
+        return NULL;
+    }
+    if (!drflac__read_uint32(&tempFlac, 20, &tempFlac.sampleRate)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 3, &tempFlac.channels)) {
+        return NULL;
+    }
+    if (!drflac__read_uint8(&tempFlac, 5, &tempFlac.bitsPerSample)) {
+        return NULL;
+    }
+    if (!drflac__read_uint64(&tempFlac, 36, &tempFlac.totalSampleCount)) {
+        return NULL;
+    }
+    if (!drflac__seek_bits(&tempFlac, 128)) {  // MD5
+        return NULL;
+    }
+
+    tempFlac.channels += 1;
+    tempFlac.bitsPerSample += 1;
+    tempFlac.totalSampleCount *= tempFlac.channels;
+
+    while (!isLastBlock) {
+        blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
+
+        switch (blockType) {
+            case DRFLAC_BLOCK_TYPE_APPLICATION:
+            {
+                tempFlac.applicationBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.applicationBlock.sizeInBytes = blockSize;
+            } break;
+            case DRFLAC_BLOCK_TYPE_SEEKTABLE:
+            {
+                tempFlac.seektableBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.seektableBlock.sizeInBytes = blockSize;
+            } break;
+            case DRFLAC_BLOCK_TYPE_VORBIS_COMMENT:
+            {
+                tempFlac.vorbisCommentBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.vorbisCommentBlock.sizeInBytes = blockSize;
+            } break;
+            case DRFLAC_BLOCK_TYPE_CUESHEET:
+            {
+                tempFlac.cuesheetBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.cuesheetBlock.sizeInBytes = blockSize;
+            } break;
+            case DRFLAC_BLOCK_TYPE_PICTURE:
+            {
+                tempFlac.pictureBlock.pos = drflac__tell(&tempFlac);
+                tempFlac.pictureBlock.sizeInBytes = blockSize;
+            } break;
+            // These blocks we either don't care about or aren't supporting.
+            case DRFLAC_BLOCK_TYPE_PADDING:
+            case DRFLAC_BLOCK_TYPE_INVALID:
+            default: break;
+        }
+
+        if (!drflac__seek_bits(&tempFlac, blockSize*8)) {
+            return NULL;
+        }
+    }
+
+    // At this point we should be sitting right at the start of the very first frame.
+    tempFlac.firstFramePos = drflac__tell(&tempFlac);
+
+    // A single allocation holds the decoder followed by room for maxBlockSize * channels decoded samples, which begin at pExtraData.
+    drflac* pFlac = (drflac*)malloc(sizeof(*pFlac) - sizeof(pFlac->pExtraData) + (tempFlac.maxBlockSize * tempFlac.channels * sizeof(int32_t)));
+    if (pFlac == NULL) {
+        return NULL;    // Out of memory.
+    }
+    memcpy(pFlac, &tempFlac, sizeof(tempFlac) - sizeof(pFlac->pExtraData));
+    pFlac->pDecodedSamples = (int32_t*)pFlac->pExtraData;
+
+    return pFlac;
+}
+
+static void drflac_close(drflac* pFlac)    // Frees a decoder together with whatever the library itself opened on its behalf.
+{
+    if (pFlac == NULL) {
+        return;     // Nothing to release.
+    }
+
+#ifndef DR_FLAC_NO_STDIO
+    // If we opened the file with drflac_open_file() we will want to close the file handle. We can know whether or not drflac_open_file()
+    // was used by looking at the callbacks.
+    if (pFlac->onRead == drflac__on_read_stdio) {
+#if defined(DR_OPUS_NO_WIN32_IO) || !defined(_WIN32)    // NOTE(review): the DR_OPUS_ prefix looks like a leftover from another library; confirm it matches the guard used where the file is opened.
+        fclose((FILE*)pFlac->pUserData);
+#else
+        CloseHandle((HANDLE)pFlac->pUserData);
+#endif
+    }
+#endif
+
+    // If we opened the file with drflac_open_memory() we will want to free() the user data.
+    if (pFlac->onRead == drflac__on_read_memory) {
+        free(pFlac->pUserData);
+    }
+    // With any other read callback the user data is left alone; only the decoder itself is freed.
+    free(pFlac);
+}
+
+// Slow path used by drflac_read_s16() when a read starts or stops part-way through an interleaved group of channels.
+// Converts <samplesToRead> samples, one at a time, from the current frame to 16-bit and returns how many were consumed.
+// The samples are written to <bufferOut> when it is non-null; otherwise they are consumed without being stored.
+// The frame's samplesRemaining counter is decremented once per sample.
+static uint64_t drflac__read_s16__misaligned(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+{
+    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+
+    // Only ever called for less than one whole group of channels, all of which is still left in the current frame.
+    assert(samplesToRead < channelCount);
+    assert(pFlac->currentFrame.samplesRemaining > 0 && samplesToRead <= pFlac->currentFrame.samplesRemaining);
+
+    uint64_t samplesDone = 0;
+    for (; samplesToRead > 0; --samplesToRead)
+    {
+        // Locate the next sample in interleaved order: which channel it belongs to, and its index within that channel.
+        uint64_t frameTotal = pFlac->currentFrame.blockSize * channelCount;
+        uint64_t frameConsumed = frameTotal - pFlac->currentFrame.samplesRemaining;
+        unsigned int channelIndex = frameConsumed % channelCount;
+        unsigned long long sampleIndex = frameConsumed / channelCount;
+
+        // Every channel assignment starts from this channel's own subframe value. The stereo modes then combine it with
+        // the value of the other subframe of the pair.
+        int own = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[sampleIndex];
+
+        // Independent channels, and the channel that a left/side or right/side pair stores directly, need nothing more.
+        int decodedSample = own;
+
+        if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE)
+        {
+            // Subframe 0 holds left and subframe 1 holds side; the second channel is left - side.
+            if (channelIndex != 0)
+            {
+                int left = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[sampleIndex];
+                decodedSample = left - own;
+            }
+        }
+        else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE)
+        {
+            // Subframe 0 holds side and subframe 1 holds right; the first channel is side + right.
+            if (channelIndex == 0)
+            {
+                int right = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[sampleIndex];
+                decodedSample = own + right;
+            }
+        }
+        else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE)
+        {
+            // Subframe 0 holds mid and subframe 1 holds side, whichever channel is being produced.
+            int mid;
+            int side;
+            if (channelIndex == 0)
+            {
+                mid  = own;
+                side = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[sampleIndex];
+            }
+            else
+            {
+                mid  = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[sampleIndex];
+                side = own;
+            }
+
+            // mid is doubled and picks up the low bit of side before the two are combined and halved again.
+            mid = (((unsigned int)mid) << 1) | (side & 0x01);
+            if (channelIndex == 0) {
+                decodedSample = (mid + side) >> 1;
+            } else {
+                decodedSample = (mid - side) >> 1;
+            }
+        }
+
+        // Rescale to 16 bits. The shift also accounts for this channel's wasted bits; a negative amount means the sample
+        // is wider than the output and has to be shifted right instead.
+        int shift = (16 - pFlac->bitsPerSample) + pFlac->currentFrame.subframes[channelIndex].wastedBitsPerSample;
+        decodedSample = (shift >= 0) ? (decodedSample << shift) : (decodedSample >> -shift);
+
+        // With a null <bufferOut> the sample is consumed but not stored.
+        if (bufferOut != NULL)
+        {
+            *bufferOut = decodedSample;
+            bufferOut += 1;
+        }
+
+        // Account for the sample both in the return value and in the frame.
+        samplesDone += 1;
+        pFlac->currentFrame.samplesRemaining -= 1;
+    }
+
+    return samplesDone;
+}
+
+static uint64_t drflac__seek_forward_by_samples(drflac* pFlac, uint64_t samplesToRead)
+{
+    // Skips up to <samplesToRead> samples without converting them, decoding further frames as needed. Returns the
+    // number of samples actually skipped, which is short of the request only when no further frame can be decoded.
+    uint64_t samplesSkipped = 0;
+    while (samplesSkipped < samplesToRead)
+    {
+        if (pFlac->currentFrame.samplesRemaining != 0) {
+            // Consume a single sample from the frame that is already decoded.
+            pFlac->currentFrame.samplesRemaining -= 1;
+            samplesSkipped += 1;
+            continue;
+        }
+        // The current frame is used up, so move on to the next one; give up if there isn't one.
+        if (!drflac__read_and_decode_next_frame(pFlac)) {
+            break;
+        }
+    }
+
+    return samplesSkipped;
+}
+
+static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
+{
+    // Reads up to <samplesToRead> interleaved samples as 16-bit PCM and returns the number read. <bufferOut> is allowed to be null, in which case this will be treated as something like a seek.
+    if (pFlac == NULL || samplesToRead == 0) {
+        return 0;
+    }
+
+    if (bufferOut == NULL) {
+        return drflac__seek_forward_by_samples(pFlac, samplesToRead);
+    }
+
+    // Each pass of the loop either decodes the next frame or drains (part of) the current one into <bufferOut>.
+    uint64_t samplesRead = 0;
+    while (samplesToRead > 0)
+    {
+        // If we've run out of samples in this frame, go to the next.
+        if (pFlac->currentFrame.samplesRemaining == 0)
+        {
+            if (!drflac__read_and_decode_next_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        }
+        else
+        {
+            // Here is where we grab the samples and interleave them.
+
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
+            uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
+            uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
+            // A previous read may have stopped part-way through a group of channels. Finishing that group takes (channelCount - remainder) samples, and never more than the caller asked for.
+            unsigned int samplesToRealign = (unsigned int)((channelCount - (samplesReadFromFrameSoFar % channelCount)) % channelCount);
+            if (samplesToRealign > 0) {
+                uint64_t misalignedSamplesRead = drflac__read_s16__misaligned(pFlac, (samplesToRealign < samplesToRead) ? samplesToRealign : samplesToRead, bufferOut);
+                samplesRead   += misalignedSamplesRead;
+                samplesReadFromFrameSoFar += misalignedSamplesRead;
+                bufferOut     += misalignedSamplesRead;
+                samplesToRead -= misalignedSamplesRead;
+            }
+
+            // Whole groups of channels take the fast path below, limited to what is left in this frame.
+            uint64_t alignedSampleCountPerChannel = samplesToRead / channelCount;
+            if (alignedSampleCountPerChannel > pFlac->currentFrame.samplesRemaining / channelCount) {
+                alignedSampleCountPerChannel = pFlac->currentFrame.samplesRemaining / channelCount;
+            }
+
+            uint64_t firstAlignedSampleInFrame = samplesReadFromFrameSoFar / channelCount;
+            int unusedBitsPerSample = 16 - pFlac->bitsPerSample;
+
+            if (unusedBitsPerSample >= 0) {     // 16 bits per sample or fewer: samples are widened with a left shift.
+                int lshift0 = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+                int lshift1 = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  << lshift0;
+                            bufferOut[i*2+1] = right << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) << lshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) << lshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] << lshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] << lshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            } else {    // More than 16 bits per sample: samples are narrowed with a right shift.
+                int rshift0 = -unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;  // NOTE(review): drflac__read_s16__misaligned() nets the wasted bits against the narrowing shift
+                int rshift1 = -unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;  // (right shift by bitsPerSample - 16 - wasted) whereas this adds them; confirm which is intended.
+
+                switch (pFlac->currentFrame.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int left  = pDecodedSamples0[i];
+                            int side  = pDecodedSamples1[i];
+                            int right = left - side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side  = pDecodedSamples0[i];
+                            int right = pDecodedSamples1[i];
+                            int left  = right + side;
+
+                            bufferOut[i*2+0] = left  >> rshift0;
+                            bufferOut[i*2+1] = right >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                            int side = pDecodedSamples1[i];
+                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);
+
+                            bufferOut[i*2+0] = ((mid + side) >> 1) >> rshift0;
+                            bufferOut[i*2+1] = ((mid - side) >> 1) >> rshift1;
+                        }
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
+                        {
+                            // Stereo optimized inner loop unroll.
+                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
+                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;
+
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                bufferOut[i*2+0] = pDecodedSamples0[i] >> rshift0;
+                                bufferOut[i*2+1] = pDecodedSamples1[i] >> rshift1;
+                            }
+                        }
+                        else
+                        {
+                            // Generic interleaving.
+                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
+                                for (unsigned int j = 0; j < channelCount; ++j) {
+                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) >> (pFlac->currentFrame.subframes[j].wastedBitsPerSample - unusedBitsPerSample);
+                                }
+                            }
+                        }
+                    } break;
+                }
+            }
+
+            uint64_t alignedSamplesRead = alignedSampleCountPerChannel * channelCount;
+            samplesRead   += alignedSamplesRead;
+            samplesReadFromFrameSoFar += alignedSamplesRead;
+            bufferOut     += alignedSamplesRead;
+            samplesToRead -= alignedSamplesRead;
+            pFlac->currentFrame.samplesRemaining -= (unsigned int)alignedSamplesRead;
+
+
+
+            // At this point we may still have some excess samples left to read.
+            if (samplesToRead > 0 && pFlac->currentFrame.samplesRemaining > 0)
+            {
+                uint64_t excessSamplesRead = 0;
+                if (samplesToRead < pFlac->currentFrame.samplesRemaining) {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, samplesToRead, bufferOut);
+                } else {
+                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, pFlac->currentFrame.samplesRemaining, bufferOut);
+                }
+
+                samplesRead   += excessSamplesRead;
+                samplesReadFromFrameSoFar += excessSamplesRead;
+                bufferOut     += excessSamplesRead;
+                samplesToRead -= excessSamplesRead;
+            }
+        }
+    }
+
+    return samplesRead;
+}
+
+static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex)
+{
+    // Positions the decoder on <sampleIndex>, which is measured in the same units as totalSampleCount. Returns true on success.
+    if (pFlac == NULL) {
+        return false;
+    }
+
+    if (sampleIndex == 0) {
+        return drflac__seek_to_first_frame(pFlac);
+    }
+
+    // Clamp the sample to the end. A total of 0 means the length is unknown (STREAMINFO allows this); "total - 1" would
+    // then wrap around to UINT64_MAX, so in that case the requested index is left untouched.
+    if (pFlac->totalSampleCount > 0 && sampleIndex >= pFlac->totalSampleCount) {
+        sampleIndex = pFlac->totalSampleCount - 1;
+    }
+
+    // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
+    if (!drflac__seek_to_sample__seek_table(pFlac, sampleIndex)) {
+        return drflac__seek_to_sample__brute_force(pFlac, sampleIndex);
+    }
+    return true;
+}
+
+
+#endif  //DR_FLAC_IMPLEMENTATION
+
+
+/*
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+*/
diff --git a/panda/src/movies/flacAudio.I b/panda/src/movies/flacAudio.I
new file mode 100644
index 0000000000..0c4a8926db
--- /dev/null
+++ b/panda/src/movies/flacAudio.I
@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.I
+ * @author rdb
+ * @date 2016-04-27
+ */
diff --git a/panda/src/movies/flacAudio.cxx b/panda/src/movies/flacAudio.cxx
new file mode 100644
index 0000000000..8bfbd56ad2
--- /dev/null
+++ b/panda/src/movies/flacAudio.cxx
@@ -0,0 +1,64 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.cxx
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#include "flacAudio.h"
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+#include "dcast.h"
+
+TypeHandle FlacAudio::_type_handle;
+
+/**
+ * Records the name of the FLAC file; nothing is opened until open() is called.
+ */
+FlacAudio::
+FlacAudio(const Filename &name) :
+  MovieAudio(name)
+{
+  _filename = name;
+}
+
+/**
+ * Does nothing; the destructor body is empty.
+ */
+FlacAudio::
+~FlacAudio() {
+}
+
+/**
+ * Opens the file through the VFS and wraps it in a cursor; NULL on failure.
+ */
+PT(MovieAudioCursor) FlacAudio::
+open() {
+  istream *stream =
+    VirtualFileSystem::get_global_ptr()->open_read_file(_filename, true);
+  if (stream == NULL) {
+    // The file could not be opened for reading.
+    return NULL;
+  }
+
+  PT(FlacAudioCursor) cursor = new FlacAudioCursor(this, stream);
+  if (cursor != NULL && cursor->_is_valid) {
+    return DCAST(MovieAudioCursor, cursor);
+  }
+  // The cursor did not come up valid.
+  return NULL;
+}
+
+/**
+ * Factory function: returns a new FlacAudio that references the given file.
+ */
+PT(MovieAudio) FlacAudio::
+make(const Filename &name) {
+  return DCAST(MovieAudio, new FlacAudio(name));
+}
diff --git a/panda/src/movies/flacAudio.h b/panda/src/movies/flacAudio.h
new file mode 100644
index 0000000000..4fb9818930
--- /dev/null
+++ b/panda/src/movies/flacAudio.h
@@ -0,0 +1,54 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudio.h
+ * @author rdb
+ * @date 2016-04-27
+ */
+
+#ifndef FLACAUDIO_H
+#define FLACAUDIO_H
+
+#include "pandabase.h"
+#include "movieAudio.h"
+
+class FlacAudioCursor;
+
+/**
+ * Reads FLAC audio files.  Ogg-encapsulated FLAC files are not supported.
+ */
+class EXPCL_PANDA_MOVIES FlacAudio : public MovieAudio {
+PUBLISHED:
+  FlacAudio(const Filename &name);
+  virtual ~FlacAudio();
+  virtual PT(MovieAudioCursor) open();  // NULL if the file can't be opened or the cursor isn't valid.
+
+  static PT(MovieAudio) make(const Filename &name);  // Factory; returns a new FlacAudio.
+
+private:
+  friend class FlacAudioCursor;
+
+public:  // Type registration.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudio::init_type();  // Make sure the parent type is registered first.
+    register_type(_type_handle, "FlacAudio",
+                  MovieAudio::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#endif // FLACAUDIO_H
diff --git a/panda/src/movies/flacAudioCursor.I b/panda/src/movies/flacAudioCursor.I
new file mode 100644
index 0000000000..c01b9a80fa
--- /dev/null
+++ b/panda/src/movies/flacAudioCursor.I
@@ -0,0 +1,12 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.I
+ * @author rdb
+ * @date 2013-08-23
+ */
diff --git a/panda/src/movies/flacAudioCursor.cxx b/panda/src/movies/flacAudioCursor.cxx
new file mode 100644
index 0000000000..5618062536
--- /dev/null
+++ b/panda/src/movies/flacAudioCursor.cxx
@@ -0,0 +1,120 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.cxx
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#include "flacAudioCursor.h"
+#include "virtualFileSystem.h"
+
+#define DR_FLAC_IMPLEMENTATION
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+/**
+ * Read callback for dr_flac.  Reads up to size bytes from the istream given
+ * as user data and returns the number of bytes actually read.
+ */
+static size_t cb_read_proc(void *user, void *buffer, size_t size) {
+  istream *stream = (istream *)user;
+  nassertr(stream != NULL, false);
+
+  stream->read((char *)buffer, size);
+  size_t num_read = stream->gcount();
+  // Reset the flags left behind by hitting EOF so the stream stays usable.
+  if (stream->eof()) {
+    stream->clear();
+  }
+  return num_read;
+}
+
+/**
+ * Seek callback for dr_flac.  Moves the istream given as user data by offset
+ * bytes relative to its current position; returns true if that succeeded.
+ */
+static bool cb_seek_proc(void *user, int offset) {
+  istream *stream = (istream *)user;
+  nassertr(stream != NULL, false);
+
+  return !stream->seekg(offset, ios::cur).fail();
+}
+
+TypeHandle FlacAudioCursor::_type_handle;
+
+/**
+ * Opens a FLAC decoder on the indicated stream and reads the stream format.
+ * On failure the stream is closed and _is_valid is left false.
+ */
+FlacAudioCursor::
+FlacAudioCursor(FlacAudio *src, istream *stream) :
+  MovieAudioCursor(src),
+  _is_valid(false),
+  _drflac(NULL)
+{
+  nassertv(stream != NULL);
+  nassertv(stream->good());
+
+  _drflac = drflac_open(&cb_read_proc, &cb_seek_proc, (void *)stream);
+  if (_drflac == NULL) {
+    movies_cat.error()
+      << "Failed to open FLAC file.\n";
+    // Without a decoder there is nothing to query; stop before touching it.
+    VirtualFileSystem::close_read_file(stream);
+    return;
+  }
+
+  _length = (_drflac->totalSampleCount / _drflac->channels) / (double)_drflac->sampleRate;
+  _audio_channels = _drflac->channels;
+  _audio_rate = _drflac->sampleRate;
+
+  _can_seek = true;
+  _can_seek_fast = _can_seek;
+
+  _is_valid = true;
+}
+
+/** Closes the decoder and the stream it read from, which this cursor owns. */
+FlacAudioCursor::
+~FlacAudioCursor() {
+  if (_drflac != NULL) {
+    istream *stream = (istream *)_drflac->pUserData;  // As given to drflac_open.
+    drflac_close(_drflac);
+    VirtualFileSystem::close_read_file(stream);
+  }
+}
+
+/**
+ * Moves the read position to time t (in seconds; negative values are treated
+ * as 0).  _last_seek is only updated if the decoder reports a successful seek.
+ */
+void FlacAudioCursor::
+seek(double t) {
+  const double rate = (double)_drflac->sampleRate;
+  uint64_t frame = max(t, 0.0) * _drflac->sampleRate;
+
+  if (!drflac_seek_to_sample(_drflac, frame * _drflac->channels)) {
+    return;
+  }
+  _last_seek = frame / rate;
+  _samples_read = 0;
+}
+
+/**
+ * Reads n samples per channel into data, which must have room for n * channels
+ * interleaved values, and advances _samples_read by the amount decoded.
+ */
+void FlacAudioCursor::
+read_samples(int n, PN_int16 *data) {
+  const int wanted = n * _audio_channels;
+  uint64_t decoded = drflac_read_s16(_drflac, wanted, data);
+  _samples_read += decoded / _audio_channels;
+}
diff --git a/panda/src/movies/flacAudioCursor.h b/panda/src/movies/flacAudioCursor.h
new file mode 100644
index 0000000000..2b4633c871
--- /dev/null
+++ b/panda/src/movies/flacAudioCursor.h
@@ -0,0 +1,65 @@
+/**
+ * PANDA 3D SOFTWARE
+ * Copyright (c) Carnegie Mellon University.  All rights reserved.
+ *
+ * All use of this software is subject to the terms of the revised BSD
+ * license.  You should have received a copy of this license along
+ * with this source code in a file named "LICENSE."
+ *
+ * @file flacAudioCursor.h
+ * @author rdb
+ * @date 2013-08-23
+ */
+
+#ifndef FLACAUDIOCURSOR_H
+#define FLACAUDIOCURSOR_H
+
+#include "pandabase.h"
+#include "movieAudioCursor.h"
+
+#define DR_FLAC_NO_STDIO
+extern "C" {
+  #include "dr_flac.h"
+}
+
+class FlacAudio;
+
+/**
+ * Decodes native FLAC audio using the dr_flac decoder included above,
+ * presenting it as a MovieAudioCursor.
+ */
+class EXPCL_PANDA_MOVIES FlacAudioCursor : public MovieAudioCursor {
+PUBLISHED:
+  FlacAudioCursor(FlacAudio *src, istream *stream);
+  virtual ~FlacAudioCursor();
+  virtual void seek(double offset);  // offset is a time in seconds.
+
+public:
+  virtual void read_samples(int n, PN_int16 *data);
+
+  bool _is_valid;  // Set by the constructor; checked by FlacAudio::open().
+
+protected:
+  drflac *_drflac;  // dr_flac decoder state; initialized to NULL.
+
+public:  // Type registration.
+  static TypeHandle get_class_type() {
+    return _type_handle;
+  }
+  static void init_type() {
+    MovieAudioCursor::init_type();  // Make sure the parent type is registered first.
+    register_type(_type_handle, "FlacAudioCursor",
+                  MovieAudioCursor::get_class_type());
+  }
+  virtual TypeHandle get_type() const {
+    return get_class_type();
+  }
+  virtual TypeHandle force_init_type() {init_type(); return get_class_type();}
+
+private:
+  static TypeHandle _type_handle;
+};
+
+#include "flacAudioCursor.I"
+
+#endif // FLACAUDIOCURSOR_H
diff --git a/panda/src/movies/p3movies_composite1.cxx b/panda/src/movies/p3movies_composite1.cxx
index ecb10339a6..ea526c30b1 100644
--- a/panda/src/movies/p3movies_composite1.cxx
+++ b/panda/src/movies/p3movies_composite1.cxx
@@ -1,4 +1,6 @@
 #include "config_movies.cxx"
+#include "flacAudio.cxx"
+#include "flacAudioCursor.cxx"
 #include "inkblotVideo.cxx"
 #include "inkblotVideoCursor.cxx"
 #include "microphoneAudio.cxx"