=== modified file 'sql/log.cc'
--- sql/log.cc	2013-01-09 22:51:51 +0000
+++ sql/log.cc	2013-04-01 18:56:39 +0000
@@ -5458,6 +5458,453 @@
 
 #ifdef HAVE_MMAP
 
+/* Log header = magic signature + one byte holding the 2pc engine count. */
+#define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)
+
+/* File signature; checked by recover() to detect a valid crashed log. */
+static const uchar tc_log_magic[]={(uchar) 254, 0x23, 0x05, 0x74};
+
+ulong opt_tc_log_size= TC_LOG_MIN_SIZE;
+/* Page-usage counters; presumably surfaced as status variables — verify. */
+ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
+
+/**
+  Open, or create and initialize, the memory-mapped tc log.
+
+  A missing file is created with opt_tc_log_size bytes; an existing file
+  implies a previous crash, so recover() is run before the log is reused.
+  The 'inited' counter records how far initialization got, so that
+  close() (invoked from the err: label) undoes exactly the work done.
+
+  @param opt_name  log file name, unpacked relative to mysql_data_home
+
+  @retval 0  success
+  @retval 1  error; partially acquired resources are released via close()
+*/
+int TC_LOG_MMAP::open(const char *opt_name)
+{
+  uint i;
+  bool crashed=FALSE;
+  PAGE *pg;
+
+  DBUG_ASSERT(total_ha_2pc > 1);
+  DBUG_ASSERT(opt_name && opt_name[0]);
+
+  tc_log_page_size= my_getpagesize();
+  DBUG_ASSERT(TC_LOG_PAGE_SIZE % tc_log_page_size == 0);
+
+  fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
+  if ((fd= my_open(logname, O_RDWR, MYF(0))) < 0)
+  {
+    if (my_errno != ENOENT)
+      goto err;
+    if (using_heuristic_recover())
+      return 1;
+    /* No log file found: create a fresh one of the configured size. */
+    if ((fd= my_create(logname, CREATE_MODE, O_RDWR, MYF(MY_WME))) < 0)
+      goto err;
+    inited=1;
+    file_length= opt_tc_log_size;
+    if (my_chsize(fd, file_length, 0, MYF(MY_WME)))
+      goto err;
+  }
+  else
+  {
+    /* File already exists: it was left over by a crashed server. */
+    inited= 1;
+    crashed= TRUE;
+    sql_print_information("Recovering after a crash using %s", opt_name);
+    if (tc_heuristic_recover)
+    {
+      sql_print_error("Cannot perform automatic crash recovery when "
+                      "--tc-heuristic-recover is used");
+      goto err;
+    }
+    file_length= my_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
+    if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
+      goto err;
+  }
+
+  data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
+                        MAP_NOSYNC|MAP_SHARED, fd, 0);
+  if (data == MAP_FAILED)
+  {
+    my_errno=errno;
+    goto err;
+  }
+  inited=2;
+
+  npages=(uint)file_length/tc_log_page_size;
+  if (npages < 3)             // to guarantee non-empty pool
+    goto err;
+  if (!(pages=(PAGE *)my_malloc(npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
+    goto err;
+  inited=3;
+  for (pg=pages, i=0; i < npages; i++, pg++)
+  {
+    pg->next=pg+1;
+    pg->waiters=0;
+    pg->state=POOL;
+    pthread_mutex_init(&pg->lock, MY_MUTEX_INIT_FAST);
+    pthread_cond_init (&pg->cond, 0);
+    pg->ptr= pg->start=(my_xid *)(data + i*tc_log_page_size);
+    pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
+    pg->end=pg->start + pg->size;
+  }
+  /* Page 0 also holds the log header, so it has fewer xid slots. */
+  pages[0].size=pages[0].free=
+                (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
+  pages[0].start=pages[0].end-pages[0].size;
+  pages[npages-1].next=0;
+  inited=4;
+
+  if (crashed && recover())
+      goto err;
+
+  /* (Re)write the header: magic + number of 2pc-capable storage engines. */
+  memcpy(data, tc_log_magic, sizeof(tc_log_magic));
+  data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
+  my_msync(fd, data, tc_log_page_size, MS_SYNC);
+  inited=5;
+
+  pthread_mutex_init(&LOCK_sync,    MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_active,  MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_pool,    MY_MUTEX_INIT_FAST);
+  pthread_cond_init(&COND_active, 0);
+  pthread_cond_init(&COND_pool, 0);
+
+  inited=6;
+
+  syncing= 0;
+  active=pages;                 // first page becomes the active one
+  DBUG_ASSERT(npages >= 2);
+  pool=pages+1;                 // remaining pages form the free pool
+  pool_last_ptr= &((pages+npages-1)->next);
+
+  return 0;
+
+err:
+  close();
+  return 1;
+}
+
+/**
+  There is no active page; get one from the pool.
+
+  Two strategies here:
+    -# take the first from the pool
+    -# if there're waiters - take the one with the most free space.
+
+  Must be called with LOCK_active held (asserted below). Returns with the
+  chosen page installed as 'active' and with active->lock held, so the
+  caller can immediately write an xid into it.
+
+  @todo
+    page merging. try to allocate adjacent page first,
+    so that they can be flushed both in one sync
+*/
+
+void TC_LOG_MMAP::get_active_from_pool()
+{
+  PAGE **p, **best_p=0;
+  int best_free;
+
+  pthread_mutex_lock(&LOCK_pool);
+
+  do
+  {
+    best_p= p= &pool;
+    if ((*p)->waiters == 0 && (*p)->free > 0) // can the first page be used ?
+      break;                                  // yes - take it.
+
+    best_free=0;            // no - trying second strategy
+    for (p=&(*p)->next; *p; p=&(*p)->next)
+    {
+      if ((*p)->waiters == 0 && (*p)->free > best_free)
+      {
+        best_free=(*p)->free;
+        best_p=p;
+      }
+    }
+  }
+  while ((*best_p == 0 || best_free == 0) && overflow());
+
+  safe_mutex_assert_owner(&LOCK_active);
+  active=*best_p;
+
+  /* Unlink the page from the pool. */
+  if (!(*best_p)->next)
+    pool_last_ptr= best_p;       // taking the tail: fix the tail pointer
+  *best_p=(*best_p)->next;
+  pthread_mutex_unlock(&LOCK_pool);
+
+  pthread_mutex_lock(&active->lock);
+  if (active->free == active->size) // we've chosen an empty page
+  {
+    tc_log_cur_pages_used++;
+    set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
+  }
+}
+
+/**
+  Handle an exhausted pool: block until unlog()/sync() frees a page.
+
+  Must be called with LOCK_pool held; pthread_cond_wait() releases it
+  while waiting and reacquires it before returning.
+
+  @todo
+  perhaps, increase log size ?
+
+  @return always 1, so it can be used directly in a loop condition
+*/
+int TC_LOG_MMAP::overflow()
+{
+  /*
+    simple overflow handling - just wait
+    TODO perhaps, increase log size ?
+    let's check the behaviour of tc_log_page_waits first
+  */
+  tc_log_page_waits++;
+  pthread_cond_wait(&COND_pool, &LOCK_pool);
+  return 1; // always return 1
+}
+
+/**
+  Record that transaction XID is committed on the persistent storage.
+
+    This function is called in the middle of two-phase commit:
+    First all resources prepare the transaction, then tc_log->log() is called,
+    then all resources commit the transaction, then tc_log->unlog() is called.
+
+    All access to active page is serialized but it's not a problem, as
+    we're assuming that fsync() will be a main bottleneck.
+    That is, parallelizing writes to log pages we'll decrease number of
+    threads waiting for a page, but then all these threads will be waiting
+    for a fsync() anyway
+
+   If tc_log == MYSQL_LOG then tc_log writes transaction to binlog and
+   records XID in a special Xid_log_event.
+   If tc_log == TC_LOG_MMAP then xid is written in a special memory-mapped
+   log.
+
+  @retval
+    0  - error
+  @retval
+    \# - otherwise, "cookie", a number that will be passed as an argument
+    to unlog() call. tc_log can define it any way it wants,
+    and use for whatever purposes. TC_LOG_MMAP sets it
+    to the position in memory where xid was logged to.
+*/
+
+int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
+{
+  int err;
+  PAGE *p;
+  ulong cookie;
+
+  pthread_mutex_lock(&LOCK_active);
+
+  /*
+    if the active page is full - just wait...
+    frankly speaking, active->free here accessed outside of mutex
+    protection, but it's safe, because it only means we may miss an
+    unlog() for the active page, and we're not waiting for it here -
+    unlog() does not signal COND_active.
+  */
+  while (unlikely(active && active->free == 0))
+    pthread_cond_wait(&COND_active, &LOCK_active);
+
+  /* no active page ? take one from the pool */
+  if (active == 0)
+    get_active_from_pool();
+  else
+    pthread_mutex_lock(&active->lock);
+
+  p=active;
+
+  /*
+    p->free is always > 0 here because to decrease it one needs
+    to take p->lock and before it one needs to take LOCK_active.
+    But checked that active->free > 0 under LOCK_active and
+    haven't release it ever since
+  */
+
+  /* searching for an empty slot */
+  while (*p->ptr)
+  {
+    p->ptr++;
+    DBUG_ASSERT(p->ptr < p->end);               // because p->free > 0
+  }
+
+  /* found! store xid there and mark the page dirty */
+  cookie= (ulong)((uchar *)p->ptr - data);      // can never be zero
+  *p->ptr++= xid;
+  p->free--;
+  p->state= DIRTY;
+  pthread_mutex_unlock(&p->lock);
+
+  /* get the page synced: wait for the current syncer or become the syncer */
+  pthread_mutex_lock(&LOCK_sync);
+  if (syncing)
+  {                                          // somebody's syncing. let's wait
+    pthread_mutex_unlock(&LOCK_active);
+    pthread_mutex_lock(&p->lock);
+    p->waiters++;
+    while (p->state == DIRTY && syncing)
+    {
+      pthread_mutex_unlock(&p->lock);
+      pthread_cond_wait(&p->cond, &LOCK_sync);
+      pthread_mutex_lock(&p->lock);
+    }
+    p->waiters--;
+    err= p->state == ERROR;
+    if (p->state != DIRTY)                   // page was synced
+    {
+      pthread_mutex_unlock(&LOCK_sync);
+      if (p->waiters == 0)
+        pthread_cond_signal(&COND_pool);     // in case somebody's waiting
+      pthread_mutex_unlock(&p->lock);
+      goto done;                             // we're done
+    }
+    /* page still dirty and the previous syncer is gone: take over */
+    DBUG_ASSERT(!syncing);
+    pthread_mutex_unlock(&p->lock);
+    syncing = p;
+    pthread_mutex_unlock(&LOCK_sync);
+
+    pthread_mutex_lock(&LOCK_active);
+    active=0;                                  // page is not active anymore
+    pthread_cond_broadcast(&COND_active);
+    pthread_mutex_unlock(&LOCK_active);
+  }
+  else
+  {
+    syncing = p;                               // place is vacant - take it
+    pthread_mutex_unlock(&LOCK_sync);
+    active = 0;                                // page is not active anymore
+    pthread_cond_broadcast(&COND_active);
+    pthread_mutex_unlock(&LOCK_active);
+  }
+  err= sync();
+
+done:
+  return err ? 0 : cookie;
+}
+
+/**
+  Sync the 'syncing' page to disk and return it to the pool.
+
+  Called without any locks held; this thread owns 'syncing' (it was
+  claimed under LOCK_sync in log_xid()).
+
+  @return 0 on success, non-zero if my_msync() failed; on failure the
+          page is marked ERROR so waiters in log_xid() report an error
+*/
+int TC_LOG_MMAP::sync()
+{
+  int err;
+
+  DBUG_ASSERT(syncing != active);
+
+  /*
+    sit down and relax - this can take a while...
+    note - no locks are held at this point
+  */
+  err= my_msync(fd, syncing->start, syncing->size * sizeof(my_xid), MS_SYNC);
+
+  /* page is synced. let's move it to the pool */
+  pthread_mutex_lock(&LOCK_pool);
+  (*pool_last_ptr)=syncing;
+  pool_last_ptr=&(syncing->next);
+  syncing->next=0;
+  syncing->state= err ? ERROR : POOL;
+  pthread_cond_signal(&COND_pool);           // in case somebody's waiting
+  pthread_mutex_unlock(&LOCK_pool);
+
+  /* marking 'syncing' slot free */
+  pthread_mutex_lock(&LOCK_sync);
+  pthread_cond_broadcast(&syncing->cond);    // signal "sync done"
+  syncing=0;
+  /*
+    we check the "active" pointer without LOCK_active. Still, it's safe -
+    "active" can change from NULL to not NULL any time, but it
+    will take LOCK_sync before waiting on active->cond. That is, it can never
+    miss a signal.
+    And "active" can change to NULL only by the syncing thread
+    (the thread that will send a signal below)
+  */
+  if (active)
+    pthread_cond_signal(&active->cond);      // wake up a new syncer
+  pthread_mutex_unlock(&LOCK_sync);
+  return err;
+}
+
+/**
+  erase xid from the page, update page free space counters/pointers.
+  cookie points directly to the memory where xid was logged.
+
+  Per the log_xid() contract, this runs after all resources have
+  committed the transaction; the slot becomes reusable immediately.
+*/
+
+int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
+{
+  PAGE *p=pages+(cookie/tc_log_page_size);
+  my_xid *x=(my_xid *)(data+cookie);
+
+  DBUG_ASSERT(*x == xid);
+  DBUG_ASSERT(x >= p->start && x < p->end);
+
+  pthread_mutex_lock(&p->lock);
+  *x=0;
+  p->free++;
+  DBUG_ASSERT(p->free <= p->size);
+  set_if_smaller(p->ptr, x);           // next search may reuse this slot
+  if (p->free == p->size)              // the page is completely empty
+    statistic_decrement(tc_log_cur_pages_used, &LOCK_status);
+  if (p->waiters == 0)                 // the page is in pool and ready to rock
+    pthread_cond_signal(&COND_pool);   // ping ... for overflow()
+  pthread_mutex_unlock(&p->lock);
+  return 0;
+}
+
+/**
+  Release everything open() managed to initialize.
+
+  'inited' records how far open() got; each switch case undoes one
+  stage and deliberately falls through to undo the earlier stages.
+*/
+void TC_LOG_MMAP::close()
+{
+  uint i;
+  switch (inited) {
+  case 6:
+    pthread_mutex_destroy(&LOCK_sync);
+    pthread_mutex_destroy(&LOCK_active);
+    pthread_mutex_destroy(&LOCK_pool);
+    pthread_cond_destroy(&COND_active);  // was leaked: init'ed in open() but
+                                         // never destroyed here
+    pthread_cond_destroy(&COND_pool);
+    /* fall through */
+  case 5:
+    data[0]='A'; // garble the first (signature) byte, in case my_delete fails
+    /* fall through */
+  case 4:
+    for (i=0; i < npages; i++)
+    {
+      if (pages[i].ptr == 0)
+        break;
+      pthread_mutex_destroy(&pages[i].lock);
+      pthread_cond_destroy(&pages[i].cond);
+    }
+    /* fall through */
+  case 3:
+    my_free((uchar*)pages, MYF(0));
+    /* fall through */
+  case 2:
+    my_munmap((char*)data, (size_t)file_length);
+    /* fall through */
+  case 1:
+    my_close(fd, MYF(0));
+  }
+  if (inited>=5) // cannot do in the switch because of Windows
+    my_delete(logname, MYF(MY_WME));
+  inited=0;
+}
+
+/**
+  Collect all xids still present in the log and hand them to ha_recover().
+
+  Fails if the magic header is wrong, or if the number of 2pc-capable
+  engines differs from the count recorded in the header when the server
+  crashed. On success the whole file is zeroed out for reuse.
+
+  @retval 0  recovery succeeded
+  @retval 1  recovery failed; the error message tells the user what to do
+*/
+int TC_LOG_MMAP::recover()
+{
+  HASH xids;
+  PAGE *p=pages, *end_p=pages+npages;
+
+  if (bcmp(data, tc_log_magic, sizeof(tc_log_magic)))
+  {
+    sql_print_error("Bad magic header in tc log");
+    goto err1;
+  }
+
+  /*
+    the first byte after magic signature is set to current
+    number of storage engines on startup
+  */
+  if (data[sizeof(tc_log_magic)] != total_ha_2pc)
+  {
+    sql_print_error("Recovery failed! You must enable "
+                    "exactly %d storage engines that support "
+                    "two-phase commit protocol",
+                    data[sizeof(tc_log_magic)]);
+    goto err1;
+  }
+
+  /* keys are the my_xid values stored directly in the mapped file */
+  if (hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0,
+                sizeof(my_xid), 0, 0, MYF(0)))
+    goto err1;
+
+  for ( ; p < end_p ; p++)
+  {
+    for (my_xid *x=p->start; x < p->end; x++)
+      if (*x && my_hash_insert(&xids, (uchar *)x))
+        goto err2; // OOM
+  }
+
+  if (ha_recover(&xids))
+    goto err2;
+
+  hash_free(&xids);
+  bzero(data, (size_t)file_length);   // log is now empty and reusable
+  return 0;
+
+err2:
+  hash_free(&xids);
+err1:
+  sql_print_error("Crash recovery failed. Either correct the problem "
+                  "(if it's, for example, out of memory error) and restart, "
+                  "or delete tc log and start mysqld with "
+                  "--tc-heuristic-recover={commit|rollback}");
+  return 1;
+}
 #endif
 
 TC_LOG *tc_log;

=== modified file 'sql/log.h'
--- sql/log.h	2012-11-20 12:28:53 +0000
+++ sql/log.h	2013-04-01 19:08:06 +0000
@@ -55,9 +55,61 @@
 };
 
 #ifdef HAVE_MMAP
+/*
+  Transaction coordinator log kept in a memory-mapped file that is split
+  into pages, each holding an array of my_xid slots. See sql/log.cc for
+  the page life cycle (POOL -> active -> DIRTY -> synced -> POOL).
+*/
+class TC_LOG_MMAP: public TC_LOG
+{
+  public:                // only to keep Sun Forte on sol9x86 happy
+  typedef enum {
+    POOL,                 // page is in pool
+    ERROR,                // last sync failed
+    DIRTY                 // new xids added since last sync
+  } PAGE_STATE;
+
+  private:
+  typedef struct st_page {
+    struct st_page *next; // pages are linked in a fifo queue
+    my_xid *start, *end;  // usable area of a page
+    my_xid *ptr;          // next xid will be written here
+    int size, free;       // max and current number of free xid slots on the page
+    int waiters;          // number of waiters on condition
+    PAGE_STATE state;     // see above
+    pthread_mutex_t lock; // to access page data or control structure
+    pthread_cond_t  cond; // to wait for a sync
+  } PAGE;
+
+  char logname[FN_REFLEN];
+  File fd;
+  my_off_t file_length;
+  uint npages, inited;    // inited: how far open() got (undone by close())
+  uchar *data;            // start of the mmap'ed file
+  struct st_page *pages, *syncing, *active, *pool, **pool_last_ptr;
+  /*
+    note that, e.g. LOCK_active is only used to protect
+    'active' pointer, to protect the content of the active page
+    one has to use active->lock.
+    Same for LOCK_pool and LOCK_sync
+  */
+  pthread_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
+  pthread_cond_t COND_pool, COND_active;
+
+  public:
+  TC_LOG_MMAP(): inited(0) {}
+  int open(const char *opt_name);
+  void close();
+  int log_xid(THD *thd, my_xid xid);
+  int unlog(ulong cookie, my_xid xid);
+  int recover();
+
+  private:
+  void get_active_from_pool();
+  int sync();
+  int overflow();
+};
+#else
+/* No mmap() support: map the name to TC_LOG_DUMMY so callers compile unchanged. */
+#define TC_LOG_MMAP TC_LOG_DUMMY
 #endif
 
 extern TC_LOG *tc_log;
+extern TC_LOG_MMAP tc_log_mmap;
 extern TC_LOG_DUMMY tc_log_dummy;
 
 /* log info errors */

