diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index a77314b..5049b13 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -491,10 +491,19 @@ struct deflate_output_bitstream {
 	u8 *end;
 };
 
-#define MIN_OUTPUT_SIZE	(UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
+/*
+ * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
+ * present following os->end, in order to not overrun the buffer when generating
+ * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
+ * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
+ * to make the compression algorithm produce the same result on all CPU
+ * architectures (which is sometimes desirable), we unconditionally use the
+ * largest value needed on any CPU, i.e. the maximum sizeof(bitbuf_t), 8.
+ */
+#define OUTPUT_END_PADDING	8
 
 /* Initialize the output bitstream.  'size' is assumed to be at least
- * MIN_OUTPUT_SIZE.  */
+ * OUTPUT_END_PADDING.  */
 static void
 deflate_init_output(struct deflate_output_bitstream *os,
 		    void *buffer, size_t size)
@@ -503,7 +512,7 @@ deflate_init_output(struct deflate_output_bitstream *os,
 	os->bitcount = 0;
 	os->begin = buffer;
 	os->next = os->begin;
-	os->end = os->begin + size - MIN_OUTPUT_SIZE;
+	os->end = os->begin + size - OUTPUT_END_PADDING;
 }
 
 /* Add some bits to the bitbuffer variable of the output bitstream.  The caller
@@ -2774,7 +2783,7 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c,
 			    const void *in, size_t in_nbytes,
 			    void *out, size_t out_nbytes_avail)
 {
-	if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
+	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
 		return 0;
 
 	/* For extremely small inputs just use a single uncompressed block. */
@@ -2813,5 +2822,5 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
 	 */
 	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
-	return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
+	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
 }