From 3ed8632a6807d82909b897ae0e91f3f4b970421b Mon Sep 17 00:00:00 2001 From: David Rose Date: Mon, 5 Sep 2011 23:53:19 +0000 Subject: [PATCH] further bugfixes --- panda/src/framework/pandaFramework.cxx | 1 + panda/src/gobj/geom.I | 3 +- panda/src/gobj/geom.cxx | 18 +-- panda/src/gobj/geomPrimitive.cxx | 19 +-- panda/src/gobj/geomVertexData.cxx | 16 ++- panda/src/gobj/geomVertexWriter.I | 1 + panda/src/gobj/geomVertexWriter.h | 2 +- panda/src/pgraph/cullableObject.cxx | 2 +- panda/src/pgraph/renderAttrib.cxx | 34 ++++-- panda/src/pgraph/renderState.cxx | 20 ++-- panda/src/pgraph/textureAttrib.cxx | 2 + panda/src/pgraph/transformState.cxx | 26 ++-- panda/src/pipeline/cycleDataLockedReader.h | 2 +- panda/src/pipeline/cycleDataReader.h | 2 +- panda/src/pipeline/cycleDataWriter.h | 2 +- panda/src/pipeline/pipeline.cxx | 13 +- panda/src/pipeline/pipelineCycler.h | 8 +- panda/src/pipeline/pipelineCyclerTrueImpl.I | 95 +++++++++++---- panda/src/pipeline/pipelineCyclerTrueImpl.cxx | 112 +++++++++++------- panda/src/pipeline/pipelineCyclerTrueImpl.h | 15 ++- panda/src/putil/simpleHashMap.I | 30 ++++- panda/src/testbed/pgrid.cxx | 2 +- 22 files changed, 297 insertions(+), 128 deletions(-) diff --git a/panda/src/framework/pandaFramework.cxx b/panda/src/framework/pandaFramework.cxx index f23d532021..465ed24477 100644 --- a/panda/src/framework/pandaFramework.cxx +++ b/panda/src/framework/pandaFramework.cxx @@ -1640,4 +1640,5 @@ AsyncTask::DoneStatus PandaFramework:: task_garbage_collect(GenericAsyncTask *task, void *data) { TransformState::garbage_collect(); RenderState::garbage_collect(); + return AsyncTask::DS_cont; } diff --git a/panda/src/gobj/geom.I b/panda/src/gobj/geom.I index b2594c4869..5ea853472f 100644 --- a/panda/src/gobj/geom.I +++ b/panda/src/gobj/geom.I @@ -96,8 +96,7 @@ get_vertex_data(Thread *current_thread) const { INLINE bool Geom:: is_empty() const { CDReader cdata(_cycler); - return (cdata->_data.get_read_pointer()->get_num_rows() == 0 || - 
cdata->_primitives.empty()); + return cdata->_primitives.empty(); } //////////////////////////////////////////////////////////////////// diff --git a/panda/src/gobj/geom.cxx b/panda/src/gobj/geom.cxx index baee8cb26e..6de98138da 100644 --- a/panda/src/gobj/geom.cxx +++ b/panda/src/gobj/geom.cxx @@ -1724,8 +1724,15 @@ check_usage_hint() const { // already have modified the pointer on the object since we // queried it. { +#ifdef DO_PIPELINING + unref_delete((CycleData *)_cdata); +#endif Geom::CDWriter fresh_cdata(((Geom *)_object.p())->_cycler, false, _current_thread); + ((GeomPipelineReader *)this)->_cdata = fresh_cdata; +#ifdef DO_PIPELINING + _cdata->ref(); +#endif if (!fresh_cdata->_got_usage_hint) { // The cache is still stale. We have to do the work of // freshening it. @@ -1733,14 +1740,9 @@ check_usage_hint() const { nassertv(fresh_cdata->_got_usage_hint); } - // Save the new pointer, and then let the lock release itself. -#ifdef DO_PIPELINING - unref_delete((CycleData *)_cdata); -#endif - ((GeomPipelineReader *)this)->_cdata = fresh_cdata; -#ifdef DO_PIPELINING - _cdata->ref(); -#endif + // When fresh_cdata goes out of scope, its write lock is + // released, and _cdata reverts to our usual convention of an + // unlocked copy of the data. } } diff --git a/panda/src/gobj/geomPrimitive.cxx b/panda/src/gobj/geomPrimitive.cxx index 4997a3f2bd..8169e18bbf 100644 --- a/panda/src/gobj/geomPrimitive.cxx +++ b/panda/src/gobj/geomPrimitive.cxx @@ -1983,8 +1983,16 @@ check_minmax() const { // already have modified the pointer on the object since we // queried it. { +#ifdef DO_PIPELINING + unref_delete((CycleData *)_cdata); +#endif GeomPrimitive::CDWriter fresh_cdata(((GeomPrimitive *)_object.p())->_cycler, false, _current_thread); + ((GeomPrimitivePipelineReader *)this)->_cdata = fresh_cdata; +#ifdef DO_PIPELINING + _cdata->ref(); +#endif + if (!fresh_cdata->_got_minmax) { // The cache is still stale. We have to do the work of // freshening it. 
@@ -1992,14 +2000,9 @@ check_minmax() const { nassertv(fresh_cdata->_got_minmax); } - // Save the new pointer, and then let the lock release itself. -#ifdef DO_PIPELINING - unref_delete((CycleData *)_cdata); -#endif - ((GeomPrimitivePipelineReader *)this)->_cdata = fresh_cdata; -#ifdef DO_PIPELINING - _cdata->ref(); -#endif + // When fresh_cdata goes out of scope, its write lock is + // released, and _cdata reverts to our usual convention of an + // unlocked copy of the data. } } diff --git a/panda/src/gobj/geomVertexData.cxx b/panda/src/gobj/geomVertexData.cxx index 8aebc5321b..d1001a3abf 100644 --- a/panda/src/gobj/geomVertexData.cxx +++ b/panda/src/gobj/geomVertexData.cxx @@ -1012,8 +1012,22 @@ reverse_normals() const { //////////////////////////////////////////////////////////////////// CPT(GeomVertexData) GeomVertexData:: animate_vertices(bool force, Thread *current_thread) const { - CDLockedReader cdata(_cycler, current_thread); +#ifdef DO_PIPELINING + { + // In the pipelining case, we take a simple short-route + // optimization: if the vdata isn't animated, we don't need to + // grab any mutex first. + CDReader cdata(_cycler, current_thread); + if (cdata->_format->get_animation().get_animation_type() != AT_panda) { + return this; + } + } +#endif // DO_PIPELINING + // Now that we've short-circuited the short route, we reasonably + // believe the vdata is animated. Grab the mutex and make sure it's + // still animated after we've acquired it. + CDLockedReader cdata(_cycler, current_thread); if (cdata->_format->get_animation().get_animation_type() != AT_panda) { return this; } diff --git a/panda/src/gobj/geomVertexWriter.I b/panda/src/gobj/geomVertexWriter.I index 7f8c8732ba..6628df1214 100644 --- a/panda/src/gobj/geomVertexWriter.I +++ b/panda/src/gobj/geomVertexWriter.I @@ -903,6 +903,7 @@ inc_add_pointer() { if (_vertex_data != (GeomVertexData *)NULL) { // If we have a whole GeomVertexData, we must set the length of // all its arrays at once. 
+ _handle = NULL; GeomVertexDataPipelineWriter writer(_vertex_data, true, _current_thread); writer.check_array_writers(); writer.set_num_rows(max(write_row + 1, writer.get_num_rows())); diff --git a/panda/src/gobj/geomVertexWriter.h b/panda/src/gobj/geomVertexWriter.h index 66d246baf3..eed9c64722 100644 --- a/panda/src/gobj/geomVertexWriter.h +++ b/panda/src/gobj/geomVertexWriter.h @@ -173,7 +173,7 @@ private: PT(GeomVertexData) _vertex_data; int _array; PT(GeomVertexArrayData) _array_data; - + Thread *_current_thread; GeomVertexColumn::Packer *_packer; int _stride; diff --git a/panda/src/pgraph/cullableObject.cxx b/panda/src/pgraph/cullableObject.cxx index c44b7f3056..35f9278e18 100644 --- a/panda/src/pgraph/cullableObject.cxx +++ b/panda/src/pgraph/cullableObject.cxx @@ -68,7 +68,7 @@ munge_geom(GraphicsStateGuardianBase *gsg, { GeomPipelineReader geom_reader(_geom, current_thread); _munged_data = geom_reader.get_vertex_data(); - + #ifdef _DEBUG { GeomVertexDataPipelineReader data_reader(_munged_data, current_thread); diff --git a/panda/src/pgraph/renderAttrib.cxx b/panda/src/pgraph/renderAttrib.cxx index 12ea43ec49..763543a442 100644 --- a/panda/src/pgraph/renderAttrib.cxx +++ b/panda/src/pgraph/renderAttrib.cxx @@ -264,6 +264,8 @@ garbage_collect() { PStatTimer timer(_garbage_collect_pcollector); int orig_size = _attribs->get_num_entries(); + nassertr(_attribs->validate(), 0); + // How many elements to process this pass? 
int size = _attribs->get_size(); int num_this_pass = int(size * garbage_collect_states_rate); @@ -294,6 +296,7 @@ garbage_collect() { si = (si + 1) % size; } while (si != stop_at_element); _garbage_index = si; + nassertr(_attribs->validate(), 0); int new_size = _attribs->get_num_entries(); return orig_size - new_size; @@ -314,28 +317,41 @@ validate_attribs() { return true; } + if (!_attribs->validate()) { + pgraph_cat.error() + << "RenderAttrib::_attribs cache is invalid!\n"; + + int size = _attribs->get_size(); + for (int si = 0; si < size; ++si) { + if (!_attribs->has_element(si)) { + continue; + } + const RenderAttrib *attrib = _attribs->get_key(si); + cerr << si << ": " << attrib << "\n"; + attrib->get_hash(); + attrib->write(cerr, 2); + } + + return false; + } + int size = _attribs->get_size(); int si = 0; while (si < size && !_attribs->has_element(si)) { ++si; } nassertr(si < size, false); - nassertr(_attribs->get_key(si)->get_ref_count() > 0, false); + nassertr(_attribs->get_key(si)->get_ref_count() >= 0, false); int snext = si; + ++snext; while (snext < size && !_attribs->has_element(snext)) { ++snext; } while (snext < size) { + nassertr(_attribs->get_key(snext)->get_ref_count() >= 0, false); const RenderAttrib *ssi = _attribs->get_key(si); const RenderAttrib *ssnext = _attribs->get_key(snext); int c = ssi->compare_to(*ssnext); - if (c >= 0) { - pgraph_cat.error() - << "RenderAttribs out of order!\n"; - ssi->write(pgraph_cat.error(false), 2); - ssnext->write(pgraph_cat.error(false), 2); - return false; - } int ci = ssnext->compare_to(*ssi); if ((ci < 0) != (c > 0) || (ci > 0) != (c < 0) || @@ -351,10 +367,10 @@ validate_attribs() { return false; } si = snext; + ++snext; while (snext < size && !_attribs->has_element(snext)) { ++snext; } - nassertr(_attribs->get_key(si)->get_ref_count() > 0, false); } return true; diff --git a/panda/src/pgraph/renderState.cxx b/panda/src/pgraph/renderState.cxx index 532fc3b429..9bf518ddb2 100644 --- 
a/panda/src/pgraph/renderState.cxx +++ b/panda/src/pgraph/renderState.cxx @@ -1193,6 +1193,7 @@ garbage_collect() { si = (si + 1) % size; } while (si != stop_at_element); _garbage_index = si; + nassertr(_states->validate(), 0); int new_size = _states->get_num_entries(); return orig_size - new_size + num_attribs; @@ -1363,28 +1364,29 @@ validate_states() { return true; } + if (!_states->validate()) { + pgraph_cat.error() + << "RenderState::_states cache is invalid!\n"; + return false; + } + int size = _states->get_size(); int si = 0; while (si < size && !_states->has_element(si)) { ++si; } nassertr(si < size, false); - nassertr(_states->get_key(si)->get_ref_count() > 0, false); + nassertr(_states->get_key(si)->get_ref_count() >= 0, false); int snext = si; + ++snext; while (snext < size && !_states->has_element(snext)) { ++snext; } while (snext < size) { + nassertr(_states->get_key(snext)->get_ref_count() >= 0, false); const RenderState *ssi = _states->get_key(si); const RenderState *ssnext = _states->get_key(snext); int c = ssi->compare_to(*ssnext); - if (c >= 0) { - pgraph_cat.error() - << "RenderStates out of order!\n"; - ssi->write(pgraph_cat.error(false), 2); - ssnext->write(pgraph_cat.error(false), 2); - return false; - } int ci = ssnext->compare_to(*ssi); if ((ci < 0) != (c > 0) || (ci > 0) != (c < 0) || @@ -1400,10 +1402,10 @@ validate_states() { return false; } si = snext; + ++snext; while (snext < size && !_states->has_element(snext)) { ++snext; } - nassertr(_states->get_key(si)->get_ref_count() > 0, false); } return true; diff --git a/panda/src/pgraph/textureAttrib.cxx b/panda/src/pgraph/textureAttrib.cxx index 44fe1ddf4e..65a77e7910 100644 --- a/panda/src/pgraph/textureAttrib.cxx +++ b/panda/src/pgraph/textureAttrib.cxx @@ -572,6 +572,8 @@ compare_to_impl(const RenderAttrib *other) const { //////////////////////////////////////////////////////////////////// size_t TextureAttrib:: get_hash_impl() const { + check_sorted(); + size_t hash = 0; 
Stages::const_iterator si; for (si = _on_stages.begin(); si != _on_stages.end(); ++si) { diff --git a/panda/src/pgraph/transformState.cxx b/panda/src/pgraph/transformState.cxx index abc38d24a3..3b81f50c84 100644 --- a/panda/src/pgraph/transformState.cxx +++ b/panda/src/pgraph/transformState.cxx @@ -1271,6 +1271,7 @@ garbage_collect() { si = (si + 1) % size; } while (si != stop_at_element); _garbage_index = si; + nassertr(_states->validate(), 0); int new_size = _states->get_num_entries(); return orig_size - new_size; @@ -1419,28 +1420,29 @@ validate_states() { return true; } + if (!_states->validate()) { + pgraph_cat.error() + << "TransformState::_states cache is invalid!\n"; + return false; + } + int size = _states->get_size(); int si = 0; while (si < size && !_states->has_element(si)) { ++si; } nassertr(si < size, false); - nassertr(_states->get_key(si)->get_ref_count() > 0, false); + nassertr(_states->get_key(si)->get_ref_count() >= 0, false); int snext = si; + ++snext; while (snext < size && !_states->has_element(snext)) { ++snext; } while (snext < size) { + nassertr(_states->get_key(snext)->get_ref_count() >= 0, false); const TransformState *ssi = _states->get_key(si); const TransformState *ssnext = _states->get_key(snext); int c = ssi->compare_to(*ssnext); - if (c >= 0) { - pgraph_cat.error() - << "TransformStates out of order!\n"; - ssi->write(pgraph_cat.error(false), 2); - ssnext->write(pgraph_cat.error(false), 2); - return false; - } int ci = ssnext->compare_to(*ssi); if ((ci < 0) != (c > 0) || (ci > 0) != (c < 0) || @@ -1456,10 +1458,10 @@ validate_states() { return false; } si = snext; + ++snext; while (snext < size && !_states->has_element(snext)) { ++snext; } - nassertr(_states->get_key(si)->get_ref_count() > 0, false); } return true; @@ -2339,7 +2341,11 @@ do_calc_hash() { // Otherwise, hash the matrix . . . if (uniquify_matrix) { // . . . but only if the user thinks that's worthwhile. 
- _hash = get_mat().add_hash(_hash); + if ((_flags & F_mat_known) == 0) { + // Calculate the matrix without doubly-locking. + do_calc_mat(); + } + _hash = _mat.add_hash(_hash); } else { // Otherwise, hash the pointer only--any two different diff --git a/panda/src/pipeline/cycleDataLockedReader.h b/panda/src/pipeline/cycleDataLockedReader.h index 5fcdfbc197..2703ee5724 100644 --- a/panda/src/pipeline/cycleDataLockedReader.h +++ b/panda/src/pipeline/cycleDataLockedReader.h @@ -41,7 +41,7 @@ // It exists as a syntactic convenience to access the // data in the CycleData. It also allows the whole // system to compile down to nothing if -// SUPPORT_PIPELINING is not defined. +// DO_PIPELINING is not defined. //////////////////////////////////////////////////////////////////// template class CycleDataLockedReader { diff --git a/panda/src/pipeline/cycleDataReader.h b/panda/src/pipeline/cycleDataReader.h index f2107e77e5..bff69b22fe 100644 --- a/panda/src/pipeline/cycleDataReader.h +++ b/panda/src/pipeline/cycleDataReader.h @@ -35,7 +35,7 @@ // It exists as a syntactic convenience to access the // data in the CycleData. It also allows the whole // system to compile down to nothing if -// SUPPORT_PIPELINING is not defined. +// DO_PIPELINING is not defined. //////////////////////////////////////////////////////////////////// template class CycleDataReader { diff --git a/panda/src/pipeline/cycleDataWriter.h b/panda/src/pipeline/cycleDataWriter.h index 1e273d7208..5cfda7e2e9 100644 --- a/panda/src/pipeline/cycleDataWriter.h +++ b/panda/src/pipeline/cycleDataWriter.h @@ -32,7 +32,7 @@ // It exists as a syntactic convenience to access the // data in the CycleData. It also allows the whole // system to compile down to nothing if -// SUPPORT_PIPELINING is not defined. +// DO_PIPELINING is not defined. 
//////////////////////////////////////////////////////////////////// template class CycleDataWriter { diff --git a/panda/src/pipeline/pipeline.cxx b/panda/src/pipeline/pipeline.cxx index ebe81e78b3..f349d7d791 100644 --- a/panda/src/pipeline/pipeline.cxx +++ b/panda/src/pipeline/pipeline.cxx @@ -70,6 +70,11 @@ Pipeline:: void Pipeline:: cycle() { #ifdef THREADED_PIPELINE + if (pipeline_cat.is_debug()) { + pipeline_cat.debug() + << "Beginning the pipeline cycle\n"; + } + pvector< PT(CycleData) > saved_cdatas; saved_cdatas.reserve(_dirty_cyclers.size()); { @@ -152,8 +157,14 @@ cycle() { // And now it's safe to let the CycleData pointers in saved_cdatas // destruct, which may cause cascading deletes, and which will in - // turn case PipelineCyclers to remove themselves from (or add + // turn cause PipelineCyclers to remove themselves from (or add // themselves to) the _dirty_cyclers list. + saved_cdatas.clear(); + + if (pipeline_cat.is_debug()) { + pipeline_cat.debug() + << "Finished the pipeline cycle\n"; + } #endif // THREADED_PIPELINE } diff --git a/panda/src/pipeline/pipelineCycler.h b/panda/src/pipeline/pipelineCycler.h index b6cf905ed9..26d84a2157 100644 --- a/panda/src/pipeline/pipelineCycler.h +++ b/panda/src/pipeline/pipelineCycler.h @@ -40,11 +40,9 @@ // CycleDataWriter classes transparently handle this. // // If pipelining support is not enabled at compile time -// (that is, SUPPORT_PIPELINING is not defined), this -// object compiles to a minimum object that presents the -// same interface but with minimal runtime overhead. -// (Actually, this isn't true yet, but it will be one -// day.) +// (that is, DO_PIPELINING is not defined), this object +// compiles to a minimum object that presents the same +// interface but with minimal runtime overhead. 
// // We define this as a struct instead of a class to // guarantee byte placement within the object, so that diff --git a/panda/src/pipeline/pipelineCyclerTrueImpl.I b/panda/src/pipeline/pipelineCyclerTrueImpl.I index 5e69cfd81a..95fbc2a68b 100644 --- a/panda/src/pipeline/pipelineCyclerTrueImpl.I +++ b/panda/src/pipeline/pipelineCyclerTrueImpl.I @@ -72,7 +72,7 @@ read_unlocked(Thread *current_thread) const { #ifdef _DEBUG nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); #endif - return _data[pipeline_stage]; + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -94,7 +94,7 @@ read(Thread *current_thread) const { nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); #endif _lock.acquire(current_thread); - return _data[pipeline_stage]; + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -110,7 +110,7 @@ increment_read(const CycleData *pointer) const { #ifdef _DEBUG int pipeline_stage = Thread::get_current_pipeline_stage(); nassertv(pipeline_stage >= 0 && pipeline_stage < _num_stages); - nassertv(_data[pipeline_stage] == pointer); + nassertv(_data[pipeline_stage]._cdata == pointer); #endif _lock.elevate_lock(); } @@ -127,7 +127,7 @@ release_read(const CycleData *pointer) const { #ifdef _DEBUG int pipeline_stage = Thread::get_current_pipeline_stage(); nassertv(pipeline_stage >= 0 && pipeline_stage < _num_stages); - nassertv(_data[pipeline_stage] == pointer); + nassertv(_data[pipeline_stage]._cdata == pointer); #endif _lock.release(); } @@ -205,7 +205,7 @@ elevate_read(const CycleData *pointer, Thread *current_thread) { #ifdef _DEBUG int pipeline_stage = current_thread->get_pipeline_stage(); nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); - nassertr(_data[pipeline_stage] == pointer, NULL); + nassertr(_data[pipeline_stage]._cdata == pointer, NULL); #endif CycleData *new_pointer = 
write(current_thread); _lock.release(); @@ -226,7 +226,7 @@ elevate_read_upstream(const CycleData *pointer, bool force_to_0, Thread *current #ifdef _DEBUG int pipeline_stage = current_thread->get_pipeline_stage(); nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); - nassertr(_data[pipeline_stage] == pointer, NULL); + nassertr(_data[pipeline_stage]._cdata == pointer, NULL); #endif CycleData *new_pointer = write_upstream(force_to_0, current_thread); _lock.release(); @@ -246,8 +246,9 @@ increment_write(CycleData *pointer) const { #ifdef _DEBUG int pipeline_stage = Thread::get_current_pipeline_stage(); nassertv(pipeline_stage >= 0 && pipeline_stage < _num_stages); - nassertv(_data[pipeline_stage] == pointer); + nassertv(_data[pipeline_stage]._cdata == pointer); #endif + ++(_data[pipeline_stage]._writes_outstanding); _lock.elevate_lock(); } @@ -260,12 +261,8 @@ increment_write(CycleData *pointer) const { INLINE void PipelineCyclerTrueImpl:: release_write(CycleData *pointer) { TAU_PROFILE("void PipelineCyclerTrueImpl::release_write(CycleData *)", " ", TAU_USER); -#ifdef _DEBUG int pipeline_stage = Thread::get_current_pipeline_stage(); return release_write_stage(pipeline_stage, pointer); -#else - _lock.release(); -#endif } //////////////////////////////////////////////////////////////////// @@ -292,7 +289,7 @@ read_stage_unlocked(int pipeline_stage) const { #ifdef _DEBUG nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); #endif - return _data[pipeline_stage]; + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -313,7 +310,7 @@ read_stage(int pipeline_stage, Thread *current_thread) const { nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); #endif _lock.acquire(current_thread); - return _data[pipeline_stage]; + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -327,7 +324,7 @@ release_read_stage(int 
pipeline_stage, const CycleData *pointer) const { TAU_PROFILE("void PipelineCyclerTrueImpl::release_read_stage(int, const CycleData *)", " ", TAU_USER); #ifdef _DEBUG nassertv(pipeline_stage >= 0 && pipeline_stage < _num_stages); - nassertv(_data[pipeline_stage] == pointer); + nassertv(_data[pipeline_stage]._cdata == pointer); #endif _lock.release(); } @@ -347,7 +344,7 @@ elevate_read_stage(int pipeline_stage, const CycleData *pointer, TAU_PROFILE("CycleData *PipelineCyclerTrueImpl::elevate_read_stage(int, const CycleData *)", " ", TAU_USER); #ifdef _DEBUG nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); - nassertr(_data[pipeline_stage] == pointer, NULL); + nassertr(_data[pipeline_stage]._cdata == pointer, NULL); #endif CycleData *new_pointer = write_stage(pipeline_stage, current_thread); _lock.release(); @@ -369,7 +366,7 @@ elevate_read_stage_upstream(int pipeline_stage, const CycleData *pointer, TAU_PROFILE("CycleData *PipelineCyclerTrueImpl::elevate_read_stage(int, const CycleData *)", " ", TAU_USER); #ifdef _DEBUG nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); - nassertr(_data[pipeline_stage] == pointer, NULL); + nassertr(_data[pipeline_stage]._cdata == pointer, NULL); #endif CycleData *new_pointer = write_stage_upstream(pipeline_stage, force_to_0, current_thread); @@ -388,8 +385,10 @@ release_write_stage(int pipeline_stage, CycleData *pointer) { TAU_PROFILE("void PipelineCyclerTrueImpl::release_write_stage(int, const CycleData *)", " ", TAU_USER); #ifdef _DEBUG nassertv(pipeline_stage >= 0 && pipeline_stage < _num_stages); - nassertv(_data[pipeline_stage] == pointer); + nassertv(_data[pipeline_stage]._cdata == pointer); + nassertv(_data[pipeline_stage]._writes_outstanding > 0); #endif + --(_data[pipeline_stage]._writes_outstanding); _lock.release(); } @@ -401,7 +400,7 @@ release_write_stage(int pipeline_stage, CycleData *pointer) { //////////////////////////////////////////////////////////////////// INLINE TypeHandle 
PipelineCyclerTrueImpl:: get_parent_type() const { - return _data[0]->get_parent_type(); + return _data[0]._cdata->get_parent_type(); } //////////////////////////////////////////////////////////////////// @@ -419,7 +418,7 @@ cheat() const { TAU_PROFILE("CycleData *PipelineCyclerTrueImpl::cheat()", " ", TAU_USER); int pipeline_stage = Thread::get_current_pipeline_stage(); nassertr(pipeline_stage >= 0 && pipeline_stage < _num_stages, NULL); - return _data[pipeline_stage]; + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -458,12 +457,13 @@ get_write_count() const { INLINE PT(CycleData) PipelineCyclerTrueImpl:: cycle_2() { TAU_PROFILE("PT(CycleData) PipelineCyclerTrueImpl::cycle_2()", " ", TAU_USER); - PT(CycleData) last_val = _data[1].p(); + PT(CycleData) last_val = _data[1]._cdata.p(); nassertr(_lock.debug_is_locked(), last_val); nassertr(_dirty, last_val); nassertr(_num_stages == 2, last_val); - _data[1] = _data[0]; + nassertr(_data[1]._writes_outstanding == 0, last_val); + _data[1]._cdata = _data[0]._cdata; // No longer dirty. _dirty = false; @@ -482,15 +482,17 @@ cycle_2() { INLINE PT(CycleData) PipelineCyclerTrueImpl:: cycle_3() { TAU_PROFILE("PT(CycleData) PipelineCyclerTrueImpl::cycle_3()", " ", TAU_USER); - PT(CycleData) last_val = _data[2].p(); + PT(CycleData) last_val = _data[2]._cdata.p(); nassertr(_lock.debug_is_locked(), last_val); nassertr(_dirty, last_val); nassertr(_num_stages == 3, last_val); - _data[2] = _data[1]; - _data[1] = _data[0]; + nassertr(_data[2]._writes_outstanding == 0, last_val); + nassertr(_data[1]._writes_outstanding == 0, last_val); + _data[2]._cdata = _data[1]._cdata; + _data[1]._cdata = _data[0]._cdata; - if (_data[2] == _data[1]) { + if (_data[2]._cdata == _data[1]._cdata) { // No longer dirty. 
_dirty = false; } @@ -509,3 +511,46 @@ CyclerMutex(PipelineCyclerTrueImpl *cycler) { _cycler = cycler; #endif } + +//////////////////////////////////////////////////////////////////// +// Function: PipelineCyclerTrueImpl::CyclerMutex::Constructor +// Access: Public +// Description: +//////////////////////////////////////////////////////////////////// +INLINE PipelineCyclerTrueImpl::CycleDataNode:: +CycleDataNode() : + _writes_outstanding(0) +{ +} + +//////////////////////////////////////////////////////////////////// +// Function: PipelineCyclerTrueImpl::CyclerMutex::Copy Constructor +// Access: Public +// Description: +//////////////////////////////////////////////////////////////////// +INLINE PipelineCyclerTrueImpl::CycleDataNode:: +CycleDataNode(const PipelineCyclerTrueImpl::CycleDataNode ©) : + _cdata(copy._cdata), + _writes_outstanding(0) +{ +} + +//////////////////////////////////////////////////////////////////// +// Function: PipelineCyclerTrueImpl::CyclerMutex::Destructor +// Access: Public +// Description: +//////////////////////////////////////////////////////////////////// +INLINE PipelineCyclerTrueImpl::CycleDataNode:: +~CycleDataNode() { + nassertv(_writes_outstanding == 0); +} + +//////////////////////////////////////////////////////////////////// +// Function: PipelineCyclerTrueImpl::CyclerMutex::Copy Assignment +// Access: Public +// Description: +//////////////////////////////////////////////////////////////////// +INLINE void PipelineCyclerTrueImpl::CycleDataNode:: +operator = (const PipelineCyclerTrueImpl::CycleDataNode ©) { + _cdata = copy._cdata; +} diff --git a/panda/src/pipeline/pipelineCyclerTrueImpl.cxx b/panda/src/pipeline/pipelineCyclerTrueImpl.cxx index 3d16ca8207..f1a8f0852a 100644 --- a/panda/src/pipeline/pipelineCyclerTrueImpl.cxx +++ b/panda/src/pipeline/pipelineCyclerTrueImpl.cxx @@ -35,9 +35,9 @@ PipelineCyclerTrueImpl(CycleData *initial_data, Pipeline *pipeline) : } _num_stages = _pipeline->get_num_stages(); - _data = new 
NPT(CycleData)[_num_stages]; + _data = new CycleDataNode[_num_stages]; for (int i = 0; i < _num_stages; ++i) { - _data[i] = initial_data; + _data[i]._cdata = initial_data; } _pipeline->add_cycler(this); @@ -59,7 +59,7 @@ PipelineCyclerTrueImpl(const PipelineCyclerTrueImpl ©) : _num_stages = _pipeline->get_num_stages(); nassertv(_num_stages == copy._num_stages); - _data = new NPT(CycleData)[_num_stages]; + _data = new CycleDataNode[_num_stages]; // It's no longer critically important that we preserve pointerwise // equivalence between different stages in the copy, but it doesn't @@ -69,11 +69,11 @@ PipelineCyclerTrueImpl(const PipelineCyclerTrueImpl ©) : Pointers pointers; for (int i = 0; i < _num_stages; ++i) { - PT(CycleData) &new_pt = pointers[copy._data[i]]; + PT(CycleData) &new_pt = pointers[copy._data[i]._cdata]; if (new_pt == NULL) { - new_pt = copy._data[i]->make_copy(); + new_pt = copy._data[i]._cdata->make_copy(); } - _data[i] = new_pt.p(); + _data[i]._cdata = new_pt.p(); } _pipeline->add_cycler(this); @@ -97,11 +97,11 @@ operator = (const PipelineCyclerTrueImpl ©) { Pointers pointers; for (int i = 0; i < _num_stages; ++i) { - PT(CycleData) &new_pt = pointers[copy._data[i]]; + PT(CycleData) &new_pt = pointers[copy._data[i]._cdata]; if (new_pt == NULL) { - new_pt = copy._data[i]->make_copy(); + new_pt = copy._data[i]._cdata->make_copy(); } - _data[i] = new_pt.p(); + _data[i]._cdata = new_pt.p(); } if (copy._dirty && !_dirty) { @@ -146,24 +146,37 @@ write_stage(int pipeline_stage, Thread *current_thread) { } #endif // NDEBUG - CycleData *old_data = _data[pipeline_stage]; + CycleData *old_data = _data[pipeline_stage]._cdata; - // Only the node reference count is considered an important count - // for copy-on-write purposes. A standard reference of other than 1 - // just means that some code (other that the PipelineCycler) has a - // pointer, which is safe to modify. - if (old_data->get_node_ref_count() != 1) { - // Copy-on-write. 
- _data[pipeline_stage] = old_data->make_copy(); - - // Now we have differences between some of the data pointers, so - // we're "dirty". Mark it so. - if (!_dirty && _num_stages != 1) { - _pipeline->add_dirty_cycler(this); + // We only perform copy-on-write if this is the first CycleData + // requested for write mode from this thread. (We will never have + // outstanding writes for multiple threads, because we hold the + // CyclerMutex during the entire lifetime of write() .. release()). + if (_data[pipeline_stage]._writes_outstanding == 0) { + // Only the node reference count is considered an important count + // for copy-on-write purposes. A standard reference of other than 1 + // just means that some code (other that the PipelineCycler) has a + // pointer, which is safe to modify. + if (old_data->get_node_ref_count() != 1) { + // Copy-on-write. + _data[pipeline_stage]._cdata = old_data->make_copy(); + if (pipeline_cat.is_debug()) { + pipeline_cat.debug() + << "Copy-on-write a: " << old_data << " becomes " + << _data[pipeline_stage]._cdata << "\n"; + //nassertr(false, NULL); + } + + // Now we have differences between some of the data pointers, so + // we're "dirty". Mark it so. + if (!_dirty && _num_stages != 1) { + _pipeline->add_dirty_cycler(this); + } } } - return _data[pipeline_stage]; + ++(_data[pipeline_stage]._writes_outstanding); + return _data[pipeline_stage]._cdata; } //////////////////////////////////////////////////////////////////// @@ -184,30 +197,42 @@ write_stage_upstream(int pipeline_stage, bool force_to_0, Thread *current_thread } #endif // NDEBUG - CycleData *old_data = _data[pipeline_stage]; + CycleData *old_data = _data[pipeline_stage]._cdata; if (old_data->get_ref_count() != 1 || force_to_0) { // Count the number of references before the current stage, and // the number of references remaining other than those. 
int external_count = old_data->get_ref_count() - 1; int k = pipeline_stage - 1; - while (k >= 0 && _data[k] == old_data) { + while (k >= 0 && _data[k]._cdata == old_data) { --k; --external_count; } - if (external_count > 0) { + // We only perform copy-on-write if this is the first CycleData + // requested for write mode from this thread. (We will never have + // outstanding writes for multiple threads, because we hold the + // CyclerMutex during the entire lifetime of write() + // .. release()). + if (external_count > 0 && _data[pipeline_stage]._writes_outstanding == 0) { // There are references other than the ones before this stage in // the pipeline; perform a copy-on-write. PT(CycleData) new_data = old_data->make_copy(); + if (pipeline_cat.is_debug()) { + pipeline_cat.debug() + << "Copy-on-write b: " << old_data << " becomes " + << new_data << "\n"; + //nassertr(false, NULL); + } k = pipeline_stage - 1; - while (k >= 0 && (_data[k] == old_data || force_to_0)) { - _data[k] = new_data.p(); + while (k >= 0 && (_data[k]._cdata == old_data || force_to_0)) { + nassertr(_data[k]._writes_outstanding == 0, NULL); + _data[k]._cdata = new_data.p(); --k; } - _data[pipeline_stage] = new_data; + _data[pipeline_stage]._cdata = new_data; if (k >= 0 || pipeline_stage + 1 < _num_stages) { // Now we have differences between some of the data pointers, @@ -216,21 +241,23 @@ write_stage_upstream(int pipeline_stage, bool force_to_0, Thread *current_thread _pipeline->add_dirty_cycler(this); } } - + } else if (k >= 0 && force_to_0) { // There are no external pointers, so no need to copy-on-write, // but the current pointer doesn't go all the way back. Make it // do so. 
while (k >= 0) { - _data[k] = old_data; + nassertr(_data[k]._writes_outstanding == 0, NULL); + _data[k]._cdata = old_data; --k; } } } - - return _data[pipeline_stage]; + + ++(_data[pipeline_stage]._writes_outstanding); + return _data[pipeline_stage]._cdata; } - + //////////////////////////////////////////////////////////////////// // Function: PipelineCyclerTrueImpl::cycle // Access: Private @@ -251,17 +278,18 @@ write_stage_upstream(int pipeline_stage, bool force_to_0, Thread *current_thread //////////////////////////////////////////////////////////////////// PT(CycleData) PipelineCyclerTrueImpl:: cycle() { - PT(CycleData) last_val = _data[_num_stages - 1].p(); + PT(CycleData) last_val = _data[_num_stages - 1]._cdata.p(); nassertr(_lock.debug_is_locked(), last_val); nassertr(_dirty, last_val); int i; for (i = _num_stages - 1; i > 0; --i) { - _data[i] = _data[i - 1]; + nassertr(_data[i]._writes_outstanding == 0, last_val); + _data[i]._cdata = _data[i - 1]._cdata; } for (i = 1; i < _num_stages; ++i) { - if (_data[i] != _data[i - 1]) { + if (_data[i]._cdata != _data[i - 1]._cdata) { // Still dirty. return last_val; } @@ -286,7 +314,8 @@ set_num_stages(int num_stages) { // Don't bother to reallocate the array smaller; we just won't use // the rest of the array. for (int i = _num_stages; i < num_stages; ++i) { - _data[i].clear(); + nassertv(_data[i]._writes_outstanding == 0); + _data[i]._cdata.clear(); } _num_stages = num_stages; @@ -294,13 +323,14 @@ set_num_stages(int num_stages) { } else { // To increase the array, we must reallocate it larger. 
- NPT(CycleData) *new_data = new NPT(CycleData)[num_stages]; + CycleDataNode *new_data = new CycleDataNode[num_stages]; int i; for (i = 0; i < _num_stages; ++i) { - new_data[i] = _data[i]; + nassertv(_data[i]._writes_outstanding == 0); + new_data[i]._cdata = _data[i]._cdata; } for (i = _num_stages; i < num_stages; ++i) { - new_data[i] = _data[_num_stages - 1]; + new_data[i]._cdata = _data[_num_stages - 1]._cdata; } delete[] _data; diff --git a/panda/src/pipeline/pipelineCyclerTrueImpl.h b/panda/src/pipeline/pipelineCyclerTrueImpl.h index 9227e553bb..75bb422b4f 100644 --- a/panda/src/pipeline/pipelineCyclerTrueImpl.h +++ b/panda/src/pipeline/pipelineCyclerTrueImpl.h @@ -113,8 +113,19 @@ private: private: Pipeline *_pipeline; - // An array of NPT(CycleData) objects. - NPT(CycleData) *_data; + // An array of PT(CycleData) objects representing the different + // copies of the cycled data, one for each stage. + class CycleDataNode : public MemoryBase { + public: + INLINE CycleDataNode(); + INLINE CycleDataNode(const CycleDataNode &copy); + INLINE ~CycleDataNode(); + INLINE void operator = (const CycleDataNode &copy); + + NPT(CycleData) _cdata; + int _writes_outstanding; + }; + CycleDataNode *_data; int _num_stages; bool _dirty; diff --git a/panda/src/putil/simpleHashMap.I b/panda/src/putil/simpleHashMap.I index 1ae33ada2a..8670600a7b 100644 --- a/panda/src/putil/simpleHashMap.I +++ b/panda/src/putil/simpleHashMap.I @@ -123,20 +123,32 @@ store(const Key &key, const Value &data) { size_t index = get_hash(key); store_new_element(index, key, data); ++_num_entries; +#ifdef _DEBUG + nassertr(validate(), index); +#endif return index; } size_t index = get_hash(key); if (!has_element(index)) { + // This element is not already in the map; add it.
if (consider_expand_table()) { return store(key, data); } store_new_element(index, key, data); ++_num_entries; +#ifdef _DEBUG + nassertr(validate(), index); +#endif return index; } if (is_element(index, key)) { + // This element is already in the map; replace the data at that + // key. _table[index]._data = data; +#ifdef _DEBUG + nassertr(validate(), index); +#endif return index; } @@ -151,10 +163,16 @@ store(const Key &key, const Value &data) { } store_new_element(i, key, data); ++_num_entries; +#ifdef _DEBUG + nassertr(validate(), i); +#endif return i; } if (is_element(i, key)) { _table[i]._data = data; +#ifdef _DEBUG + nassertr(validate(), i); +#endif return i; } i = (i + 1) & (_table_size - 1); @@ -363,6 +381,10 @@ remove_element(int n) { // conflicts. i = (i + 1) & (_table_size - 1); } + +#ifdef _DEBUG + nassertv(validate()); +#endif } //////////////////////////////////////////////////////////////////// @@ -605,13 +627,19 @@ expand_table() { _deleted_chain = memory_hook->get_deleted_chain(alloc_size); _table = (TableEntry *)_deleted_chain->allocate(alloc_size, TypeHandle::none()); memset(get_exists_array(), 0, _table_size); + nassertv(_num_entries == 0); // Now copy the entries from the old table into the new table. 
+ int num_added = 0; for (size_t i = 0; i < old_map._table_size; ++i) { if (old_map.has_element(i)) { store(old_map._table[i]._key, old_map._table[i]._data); + ++num_added; } } - nassertv(old_map._num_entries == _num_entries); + nassertv(validate()); + nassertv(old_map.validate()); + + nassertv(_num_entries == old_map._num_entries); } diff --git a/panda/src/testbed/pgrid.cxx b/panda/src/testbed/pgrid.cxx index e14e89bdb9..e49123666d 100644 --- a/panda/src/testbed/pgrid.cxx +++ b/panda/src/testbed/pgrid.cxx @@ -215,7 +215,7 @@ load_gridded_models(WindowFramework *window, Loader loader; LoaderOptions options; - options.set_flags(options.get_flags() | LoaderOptions::LF_no_ram_cache); + // options.set_flags(options.get_flags() | LoaderOptions::LF_no_ram_cache); // First, load up each model from disk once, and store them all // separate from the scene graph. Also count up the total number of