1 files changed, 1379 insertions, 0 deletions
diff --git a/percona/5.0.84-b18-20090811/innodb_io_patches.patch b/percona/5.0.84-b18-20090811/innodb_io_patches.patch
new file mode 100644
index 0000000..aaef29a
--- /dev/null
+++ b/percona/5.0.84-b18-20090811/innodb_io_patches.patch
@@ -0,0 +1,1379 @@
+diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c	2009-05-08 06:12:03.000000000 +0900
++++ b/innobase/buf/buf0flu.c	2009-07-02 16:44:49.000000000 +0900
+@@ -898,10 +898,17 @@
+ 
+ 				old_page_count = page_count;
+ 				
++				if (srv_flush_neighbor_pages) {
+ 				/* Try to flush also all the neighbors */
+ 				page_count +=
+ 					buf_flush_try_neighbors(space, offset,
+ 								flush_type);
++				} else {
++				/* Try to flush the page only */
++				page_count +=
++					buf_flush_try_page(space, offset,
++							   flush_type);
++				}
+ 				/* fprintf(stderr,
+ 				"Flush type %lu, page no %lu, neighb %lu\n",
+ 				flush_type, offset,
+diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c	2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/buf/buf0rea.c	2009-07-02 16:44:49.000000000 +0900
+@@ -20,6 +20,7 @@
+ #include "os0file.h"
+ #include "srv0start.h"
+ 
++extern uint srv_read_ahead;
+ extern ulint srv_read_ahead_rnd;
+ extern ulint srv_read_ahead_seq;
+ extern ulint srv_buf_pool_reads;
+@@ -189,6 +190,10 @@
+ 	ulint		err;
+ 	ulint		i;
+ 
++	if (!(srv_read_ahead & 1)) {
++		return(0);
++	}
++
+ 	if (srv_startup_is_before_trx_rollback_phase) {
+ 	        /* No read-ahead to avoid thread deadlocks */
+ 	        return(0);
+@@ -396,6 +401,10 @@
+ 	ulint		err;
+ 	ulint		i;
+ 	
++	if (!(srv_read_ahead & 2)) {
++		return(0);
++	}
++
+ 	if (srv_startup_is_before_trx_rollback_phase) {
+ 	        /* No read-ahead to avoid thread deadlocks */
+ 	        return(0);
+diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c	2009-05-08 06:12:04.000000000 +0900
++++ b/innobase/ibuf/ibuf0ibuf.c	2009-07-02 16:44:49.000000000 +0900
+@@ -370,8 +370,9 @@
+ 	grow in size, as the references on the upper levels of the tree can
+ 	change */
+ 	
+-	ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
+-						/ IBUF_POOL_SIZE_PER_MAX_SIZE;
++	ibuf->max_size = ut_min(buf_pool_get_curr_size() / UNIV_PAGE_SIZE / IBUF_POOL_SIZE_PER_MAX_SIZE,
++		(ulint) (srv_ibuf_max_size / UNIV_PAGE_SIZE)); /* divide as long long, then narrow to ulint */
++	srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
+ 	ibuf->meter = IBUF_THRESHOLD + 1;
+ 
+ 	UT_LIST_INIT(ibuf->data_list);
+@@ -2258,11 +2259,13 @@
+ 
+ 	mutex_enter(&ibuf_mutex);
+ 
++	if (!srv_ibuf_active_contract) {
+ 	if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+ 		mutex_exit(&ibuf_mutex);
+ 
+ 		return;
+ 	}
++	}
+ 
+ 	sync = FALSE;
+ 	
+diff -ruN a/innobase/include/log0log.h b/innobase/include/log0log.h
+--- a/innobase/include/log0log.h	2009-05-08 06:12:06.000000000 +0900
++++ b/innobase/include/log0log.h	2009-07-02 16:44:49.000000000 +0900
+@@ -169,6 +169,13 @@
+ log_buffer_flush_to_disk(void);
+ /*==========================*/
+ /********************************************************************
++Flushes the log buffer. Forces it to disk depending on the value of
++the configuration parameter innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void);
++/*=============================*/
++/********************************************************************
+ Advances the smallest lsn for which there are unflushed dirty blocks in the
+ buffer pool and also may make a new checkpoint. NOTE: this function may only
+ be called if the calling thread owns no synchronization objects! */
+diff -ruN a/innobase/include/os0file.h b/innobase/include/os0file.h
+--- a/innobase/include/os0file.h	2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/os0file.h	2009-07-02 16:44:49.000000000 +0900
+@@ -551,8 +551,10 @@
+ /*========*/
+ 	ulint	n,		/* in: maximum number of pending aio operations
+ 				allowed; n must be divisible by n_segments */
+-	ulint	n_segments,	/* in: combined number of segments in the four
+-				first aio arrays; must be >= 4 */
++//	ulint	n_segments,	/* in: combined number of segments in the four
++//				first aio arrays; must be >= 4 */
++	ulint	n_read_threads,  /* in: n_segments == 2 + n_read_threads + n_write_threads */
++	ulint	n_write_threads, /* in: counted into n_segments together with n_read_threads */
+ 	ulint	n_slots_sync);	/* in: number of slots in the sync aio array */
+ /***********************************************************************
+ Requests an asynchronous i/o operation. */
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h	2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/srv0srv.h	2009-07-02 18:02:38.000000000 +0900
+@@ -89,6 +89,8 @@
+ extern ulint	srv_lock_table_size;
+ 
+ extern ulint	srv_n_file_io_threads;
++extern ulint	srv_n_read_io_threads;
++extern ulint	srv_n_write_io_threads;
+ 
+ #ifdef UNIV_LOG_ARCHIVE
+ extern ibool	srv_log_archive_on;
+@@ -133,6 +135,15 @@
+ extern ulong	srv_max_purge_lag;
+ extern ibool	srv_use_awe;
+ extern ibool	srv_use_adaptive_hash_indexes;
++
++extern ulint	srv_io_capacity;
++extern long long	srv_ibuf_max_size;
++extern ulint	srv_ibuf_active_contract;
++extern ulint	srv_ibuf_accel_rate;
++extern ulint	srv_flush_neighbor_pages;
++extern ulint	srv_enable_unsafe_group_commit;
++extern uint	srv_read_ahead;
++extern uint	srv_adaptive_checkpoint;
+ /*-------------------------------------------*/
+ 
+ extern ulint	srv_n_rows_inserted;
+diff -ruN a/innobase/log/log0log.c b/innobase/log/log0log.c
+--- a/innobase/log/log0log.c	2009-05-08 06:12:10.000000000 +0900
++++ b/innobase/log/log0log.c	2009-07-02 16:44:49.000000000 +0900
+@@ -1524,6 +1524,29 @@
+ }
+ 
+ /********************************************************************
++Flush the log buffer. Force it to disk depending on the value of
++innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void)
++/*=============================*/
++{
++	dulint	lsn;
++	ibool	flush;
++
++	mutex_enter(&(log_sys->mutex));
++	lsn = log_sys->lsn;
++	mutex_exit(&(log_sys->mutex));
++
++	/* Sample the setting once: if it were read twice and changed in
++	between, the flush flag and the caller tag could disagree. The log is
++	forced to disk only when innodb_flush_log_at_trx_commit = 1. */
++	flush = (srv_flush_log_at_trx_commit == 1);
++	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, flush,
++			flush ? LOG_WRITE_FROM_BACKGROUND_SYNC
++			      : LOG_WRITE_FROM_BACKGROUND_ASYNC);
++}
++/********************************************************************
+ Tries to establish a big enough margin of free space in the log buffer, such
+ that a new log entry can be catenated without an immediate need for a flush. */
+ static
+@@ -3326,6 +3349,15 @@
+ 			(ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn),
+ 			(ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn));
+ 
++	fprintf(file,
++		"Max checkpoint age  %lu\n"
++		"Modified age        %lu\n"
++		"Checkpoint age      %lu\n",
++			(ulong) log_sys->max_checkpoint_age,
++			(ulong) ut_dulint_minus(log_sys->lsn,
++					log_buf_pool_get_oldest_modification()),
++			(ulong) ut_dulint_minus(log_sys->lsn, log_sys->last_checkpoint_lsn));
++
+ 	current_time = time(NULL);
+ 			
+ 	time_elapsed = 0.001 + difftime(current_time,
+diff -ruN a/innobase/os/os0file.c b/innobase/os/os0file.c
+--- a/innobase/os/os0file.c	2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/os/os0file.c	2009-07-02 16:44:49.000000000 +0900
+@@ -66,6 +66,28 @@
+ 
+ ibool	os_aio_print_debug	= FALSE;
+ 
++/* Status of an IO request in simulated AIO.
++   Protocol for simulated aio:
++     client requests IO: find slot with reserved = FALSE. Add entry with
++                         status = OS_AIO_NOT_ISSUED.
++     IO thread wakes: find adjacent slots with reserved = TRUE and status =
++                      OS_AIO_NOT_ISSUED. Change status for slots to
++                      OS_AIO_ISSUED.
++     IO operation completes: set status for slots to OS_AIO_DONE. set status
++                             for the first slot to OS_AIO_CLAIMED and return
++                             result for that slot.
++   When there are multiple read and write threads, they all compete to execute
++   the requests in the array (os_aio_array_t). This avoids the need to load
++   balance requests at the time the request is made at the cost of waking all
++   threads when a request is available.
++*/
++typedef enum {
++	OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
++	OS_AIO_ISSUED,     /* Being processed by an IO thread. */
++	OS_AIO_DONE,       /* Request processed. */
++	OS_AIO_CLAIMED     /* Result being returned to client. */
++} os_aio_status;
++
+ /* The aio array slot structure */
+ typedef struct os_aio_slot_struct	os_aio_slot_t;
+ 
+@@ -74,6 +96,8 @@
+ 	ulint		pos;		/* index of the slot in the aio
+ 					array */
+ 	ibool		reserved;	/* TRUE if this slot is reserved */
++	os_aio_status   status;		/* Status for current request. Valid when reserved
++					is TRUE. Used only in simulated aio. */
+ 	time_t		reservation_time;/* time when reserved */
+ 	ulint		len;		/* length of the block to read or
+ 					write */
+@@ -84,11 +108,11 @@
+ 	ulint		offset_high;	/* 32 high bits of file offset */
+ 	os_file_t	file;		/* file where to read or write */
+ 	const char*	name;		/* file name or path */
+-	ibool		io_already_done;/* used only in simulated aio:
+-					TRUE if the physical i/o already
+-					made and only the slot message
+-					needs to be passed to the caller
+-					of os_aio_simulated_handle */
++//	ibool		io_already_done;/* used only in simulated aio:
++//					TRUE if the physical i/o already
++//					made and only the slot message
++//					needs to be passed to the caller
++//					of os_aio_simulated_handle */
+ 	fil_node_t*	message1;	/* message which is given by the */
+ 	void*		message2;	/* the requester of an aio operation
+ 					and which can be used to identify
+@@ -137,6 +161,13 @@
+ /* Array of events used in simulated aio */
+ os_event_t*	os_aio_segment_wait_events	= NULL;
+ 
++/* Number for the first global segment for reading. */
++const ulint os_aio_first_read_segment = 2;
++
++/* Number for the first global segment for writing. Set during aio
++initialization to os_aio_first_read_segment + n_read_threads. */
++ulint os_aio_first_write_segment = 0;
++
+ /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+ are NULL when the module has not yet been initialized. */
+ static os_aio_array_t*	os_aio_read_array	= NULL;
+@@ -145,11 +176,17 @@
+ static os_aio_array_t*	os_aio_log_array	= NULL;
+ static os_aio_array_t*	os_aio_sync_array	= NULL;
+ 
++/* Per thread buffer used for merged IO requests. Used by
++os_aio_simulated_handle so that a buffer doesn't have to be allocated
++for each request. */
++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
++
+ static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
+ 
+ /* If the following is TRUE, read i/o handler threads try to
+ wait until a batch of new read requests have been posted */
+-static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
++static volatile ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
+ 
+ ulint	os_n_file_reads		= 0;
+ ulint	os_bytes_read_since_printout = 0;
+@@ -2878,8 +2915,10 @@
+ /*========*/
+ 	ulint	n,		/* in: maximum number of pending aio operations
+ 				allowed; n must be divisible by n_segments */
+-	ulint	n_segments,	/* in: combined number of segments in the four
+-				first aio arrays; must be >= 4 */
++//	ulint	n_segments,	/* in: combined number of segments in the four
++//				first aio arrays; must be >= 4 */
++	ulint	n_read_threads,  /* in: number of read segments; n_segments == 2 + n_read_threads + n_write_threads */
++	ulint	n_write_threads, /* in: number of write segments */
+ 	ulint	n_slots_sync)	/* in: number of slots in the sync aio array */
+ {
+ 	ulint	n_read_segs;
+@@ -2889,6 +2928,8 @@
+ #ifdef POSIX_ASYNC_IO
+ 	sigset_t   sigset;
+ #endif
++	ulint	n_segments = 2 + n_read_threads + n_write_threads;
++
+ 	ut_ad(n % n_segments == 0);
+ 	ut_ad(n_segments >= 4);
+ 
+@@ -2896,14 +2937,17 @@
+ 
+ 	for (i = 0; i < n_segments; i++) {
+ 	        srv_set_io_thread_op_info(i, "not started yet");
++		os_aio_thread_buffer[i] = 0;
++		os_aio_thread_buffer_size[i] = 0;
+ 	}
+ 
+ 	n_per_seg = n / n_segments;
+-	n_write_segs = (n_segments - 2) / 2;
+-	n_read_segs = n_segments - 2 - n_write_segs;
++	n_write_segs = n_write_threads;
++	n_read_segs = n_read_threads;
+ 	
+ 	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+ 
++	os_aio_first_write_segment = os_aio_first_read_segment + n_read_threads;
+ 	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+ 
+ 	srv_io_thread_function[0] = "insert buffer thread";
+@@ -2912,14 +2956,14 @@
+ 
+ 	srv_io_thread_function[1] = "log thread";
+ 
+-	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
++	os_aio_read_array = os_aio_array_create(n_per_seg,
+ 							n_read_segs);
+ 	for (i = 2; i < 2 + n_read_segs; i++) {
+ 		ut_a(i < SRV_MAX_N_IO_THREADS);
+ 	        srv_io_thread_function[i] = "read thread";
+ 	}
+ 
+-	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
++	os_aio_write_array = os_aio_array_create(n_per_seg,
+ 							n_write_segs);
+ 	for (i = 2 + n_read_segs; i < n_segments; i++) {
+ 		ut_a(i < SRV_MAX_N_IO_THREADS);
+@@ -3181,6 +3225,13 @@
+ 	struct aiocb*	control;
+ #endif
+ 	ulint		i;
++	ulint		prim_segment;
++	ulint		n;
++
++	n = array->n_slots / array->n_segments;
++	/* 64 blocks' striping ( aligning max(BUF_READ_AHEAD_AREA) ) */
++	prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments);
++
+ loop:
+ 	os_mutex_enter(array->mutex);
+ 
+@@ -3199,6 +3250,16 @@
+ 		goto loop;
+ 	}
+ 
++	for (i = prim_segment * n; i < array->n_slots; i++) {
++		slot = os_aio_array_get_nth_slot(array, i);
++
++		if (slot->reserved == FALSE) {
++			break;
++		}
++	}
++
++	if (slot->reserved == TRUE){
++		/* Not found after the intended segment. So we should search before. */
+ 	for (i = 0;; i++) {
+ 		slot = os_aio_array_get_nth_slot(array, i);
+ 
+@@ -3206,6 +3267,7 @@
+ 			break;
+ 		}
+ 	}
++	}
+ 
+ 	array->n_reserved++;
+ 
+@@ -3228,7 +3290,8 @@
+ 	slot->buf      = buf;
+ 	slot->offset   = offset;
+ 	slot->offset_high = offset_high;
+-	slot->io_already_done = FALSE;
++//	slot->io_already_done = FALSE;
++	slot->status = OS_AIO_NOT_ISSUED;
+ 	
+ #ifdef WIN_ASYNC_IO		
+ 	control = &(slot->control);
+@@ -3281,6 +3344,7 @@
+ 	ut_ad(slot->reserved);
+ 	
+ 	slot->reserved = FALSE;
++	slot->status = OS_AIO_NOT_ISSUED;
+ 
+ 	array->n_reserved--;
+ 
+@@ -3317,16 +3381,18 @@
+ 
+ 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
+ 
+-	n = array->n_slots / array->n_segments;
++	n = array->n_slots;
+ 
+ 	/* Look through n slots after the segment * n'th slot */
+ 
+ 	os_mutex_enter(array->mutex);
+ 
+ 	for (i = 0; i < n; i++) {
+-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
++		slot = os_aio_array_get_nth_slot(array, i);
+ 
+-		if (slot->reserved) {
++		if (slot->reserved &&
++		    (slot->status == OS_AIO_NOT_ISSUED ||
++		     slot->status == OS_AIO_DONE)) {
+ 			/* Found an i/o request */
+ 			
+ 			break;
+@@ -3336,7 +3402,25 @@
+ 	os_mutex_exit(array->mutex);
+ 
+ 	if (i < n) {
+-		os_event_set(os_aio_segment_wait_events[global_segment]);
++		if (array == os_aio_ibuf_array) {
++			os_event_set(os_aio_segment_wait_events[0]);
++
++		} else if (array == os_aio_log_array) {
++			os_event_set(os_aio_segment_wait_events[1]);
++
++		} else if (array == os_aio_read_array) {
++			ulint	x;
++			for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
++				os_event_set(os_aio_segment_wait_events[x]);
++
++		} else if (array == os_aio_write_array) {
++			ulint	x;
++			for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
++				os_event_set(os_aio_segment_wait_events[x]);
++
++		} else {
++			ut_a(0);
++		}
+ 	}
+ }
+ 
+@@ -3347,8 +3431,6 @@
+ os_aio_simulated_wake_handler_threads(void)
+ /*=======================================*/
+ {
+-	ulint	i;
+-
+ 	if (os_aio_use_native_aio) {
+ 		/* We do not use simulated aio: do nothing */
+ 
+@@ -3357,9 +3439,10 @@
+ 
+ 	os_aio_recommend_sleep_for_read_threads	= FALSE;
+ 
+-	for (i = 0; i < os_aio_n_segments; i++) {
+-		os_aio_simulated_wake_handler_thread(i);
+-	}
++	os_aio_simulated_wake_handler_thread(0);
++	os_aio_simulated_wake_handler_thread(1);
++	os_aio_simulated_wake_handler_thread(os_aio_first_read_segment);
++	os_aio_simulated_wake_handler_thread(os_aio_first_write_segment);
+ }
+ 
+ /**************************************************************************
+@@ -3640,7 +3723,7 @@
+ 	ut_ad(os_aio_validate());
+ 	ut_ad(segment < array->n_segments);
+ 
+-	n = array->n_slots / array->n_segments;
++	n = array->n_slots;
+ 
+ 	if (array == os_aio_sync_array) {
+ 		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+@@ -3648,12 +3731,12 @@
+ 	} else {
+ 		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+ 		i = os_event_wait_multiple(n,
+-				(array->native_events) + segment * n);
++				(array->native_events));
+ 	}
+ 
+ 	os_mutex_enter(array->mutex);
+ 
+-	slot = os_aio_array_get_nth_slot(array, i + segment * n);
++	slot = os_aio_array_get_nth_slot(array, i);
+ 
+ 	ut_a(slot->reserved);
+ 
+@@ -3830,10 +3913,13 @@
+ 	os_aio_slot_t*	slot;
+ 	os_aio_slot_t*	slot2;
+ 	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
++	os_aio_slot_t*	lowest_request;
++	os_aio_slot_t*	oldest_request;
+ 	ulint		n_consecutive;
+ 	ulint		total_len;
+ 	ulint		offs;
+ 	ulint		lowest_offset;
++	ulint		oldest_offset;
+ 	ulint		biggest_age;
+ 	ulint		age;
+ 	byte*		combined_buf;
+@@ -3841,6 +3927,7 @@
+ 	ibool		ret;
+ 	ulint		n;
+ 	ulint		i;
++	time_t		now;
+ 	
+ 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
+ 	
+@@ -3853,7 +3940,7 @@
+ 	ut_ad(os_aio_validate());
+ 	ut_ad(segment < array->n_segments);
+ 
+-	n = array->n_slots / array->n_segments;
++	n = array->n_slots;
+ 
+ 	/* Look through n slots after the segment * n'th slot */
+ 
+@@ -3875,9 +3962,9 @@
+ 	done */
+ 	
+ 	for (i = 0; i < n; i++) {
+-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
++		slot = os_aio_array_get_nth_slot(array, i);
+ 
+-		if (slot->reserved && slot->io_already_done) {
++		if (slot->reserved && slot->status == OS_AIO_DONE) {
+ 
+ 			if (os_aio_print_debug) {
+ 				fprintf(stderr,
+@@ -3897,67 +3984,57 @@
+ 	then pick the one at the lowest offset. */
+ 
+ 	biggest_age = 0;
+-	lowest_offset = ULINT_MAX;
++	now = time(NULL);
++	oldest_request = lowest_request = NULL;
++	oldest_offset = lowest_offset = ULINT_MAX;
+ 
++	/* Find the oldest request and the request with the smallest offset */
+ 	for (i = 0; i < n; i++) {
+-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
++		slot = os_aio_array_get_nth_slot(array, i);
+ 
+-		if (slot->reserved) {
+-		        age = (ulint)difftime(time(NULL),
+-						slot->reservation_time);
++		if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
++			age = (ulint)difftime(now, slot->reservation_time);
+ 
+ 			if ((age >= 2 && age > biggest_age)
+ 			    || (age >= 2 && age == biggest_age
+-			        && slot->offset < lowest_offset)) {
++				&& slot->offset < oldest_offset)) {
+ 
+ 			        /* Found an i/o request */
+-				consecutive_ios[0] = slot;
+-
+-				n_consecutive = 1;
+-
+ 				biggest_age = age;
+-				lowest_offset = slot->offset;
++				oldest_request = slot;
++				oldest_offset = slot->offset;
+ 			}
+-		}
+-	}
+-
+-	if (n_consecutive == 0) {
+-	        /* There were no old requests. Look for an i/o request at the
+-		lowest offset in the array (we ignore the high 32 bits of the
+-		offset in these heuristics) */
+-
+-		lowest_offset = ULINT_MAX;
+-	
+-		for (i = 0; i < n; i++) {
+-		        slot = os_aio_array_get_nth_slot(array,
+-							i + segment * n);
+-
+-			if (slot->reserved && slot->offset < lowest_offset) {
+ 
++			/* Look for an i/o request at the lowest offset in the array
++			 * (we ignore the high 32 bits of the offset) */
++			if (slot->offset < lowest_offset) {
+ 			        /* Found an i/o request */
+-				consecutive_ios[0] = slot;
+-
+-				n_consecutive = 1;
+-
++				lowest_request = slot;
+ 				lowest_offset = slot->offset;
+ 			}
+ 		}
+ 	}
+ 
+-	if (n_consecutive == 0) {
++	if (!lowest_request && !oldest_request) {
+ 
+ 		/* No i/o requested at the moment */
+ 
+ 		goto wait_for_io;
+ 	}
+ 
+-	slot = consecutive_ios[0];
++	if (oldest_request) {
++		slot = oldest_request;
++	} else {
++		slot = lowest_request;
++	}
++	consecutive_ios[0] = slot;
++	n_consecutive = 1;
+ 
+ 	/* Check if there are several consecutive blocks to read or write */
+ 
+ consecutive_loop:	
+ 	for (i = 0; i < n; i++) {
+-		slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
++		slot2 = os_aio_array_get_nth_slot(array, i);
+ 
+ 		if (slot2->reserved && slot2 != slot
+ 		    && slot2->offset == slot->offset + slot->len
+@@ -3965,7 +4042,8 @@
+ 						sum does not wrap over */
+ 		    && slot2->offset_high == slot->offset_high
+ 		    && slot2->type == slot->type
+-		    && slot2->file == slot->file) {
++		    && slot2->file == slot->file
++		    && slot2->status == OS_AIO_NOT_ISSUED) {
+ 
+ 			/* Found a consecutive i/o request */
+ 
+@@ -3994,6 +4072,8 @@
+ 	
+ 	for (i = 0; i < n_consecutive; i++) {
+ 		total_len += consecutive_ios[i]->len;
++		ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
++		consecutive_ios[i]->status = OS_AIO_ISSUED;
+ 	}
+ 
+ 	if (n_consecutive == 1) {
+@@ -4001,7 +4081,14 @@
+ 		combined_buf = slot->buf;
+ 		combined_buf2 = NULL;
+ 	} else {
+-		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
++		if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
++			if (os_aio_thread_buffer[global_segment])
++				ut_free(os_aio_thread_buffer[global_segment]);
++
++			os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
++			os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
++		}
++		combined_buf2 = os_aio_thread_buffer[global_segment];
+ 
+ 		ut_a(combined_buf2);
+ 
+@@ -4012,6 +4099,9 @@
+ 	this assumes that there is just one i/o-handler thread serving
+ 	a single segment of slots! */
+ 
++	ut_a(slot->reserved);
++	ut_a(slot->status == OS_AIO_ISSUED);
++
+ 	os_mutex_exit(array->mutex);
+ 
+ 	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+@@ -4081,16 +4171,13 @@
+ 		}
+ 	}
+ 
+-	if (combined_buf2) {
+-		ut_free(combined_buf2);
+-	}
+-
+ 	os_mutex_enter(array->mutex);
+ 
+ 	/* Mark the i/os done in slots */
+ 
+ 	for (i = 0; i < n_consecutive; i++) {
+-		consecutive_ios[i]->io_already_done = TRUE;
++		ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
++		consecutive_ios[i]->status = OS_AIO_DONE;
+ 	}
+ 
+ 	/* We return the messages for the first slot now, and if there were
+@@ -4100,6 +4187,8 @@
+ slot_io_done:
+ 
+ 	ut_a(slot->reserved);
++	ut_a(slot->status == OS_AIO_DONE);
++	slot->status = OS_AIO_CLAIMED;
+ 
+ 	*message1 = slot->message1;
+ 	*message2 = slot->message2;
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c	2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/srv/srv0srv.c	2009-07-02 18:36:54.000000000 +0900
+@@ -167,6 +167,8 @@
+ ulint	srv_lock_table_size	= ULINT_MAX;
+ 
+ ulint	srv_n_file_io_threads	= ULINT_MAX;
++ulint	srv_n_read_io_threads	= 1;
++ulint	srv_n_write_io_threads	= 1;
+ 
+ #ifdef UNIV_LOG_ARCHIVE
+ ibool	srv_log_archive_on	= FALSE;
+@@ -330,6 +332,24 @@
+ ibool	srv_use_awe			= FALSE;
+ ibool	srv_use_adaptive_hash_indexes 	= TRUE;
+ 
++ulint	srv_io_capacity = 100;
++
++/* Returns the number of IO operations that is pct percent of srv_io_capacity,
++e.g. PCT_IO(5) is 5% of the configured capacity. The argument is
++parenthesized so that an expression such as PCT_IO(a + b) expands correctly. */
++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) (pct) / 100.0)))
++
++long long	srv_ibuf_max_size = 0;
++ulint	srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
++ulint	srv_ibuf_accel_rate = 100; /* percent: 100 makes PCT_IBUF_IO(x) equal to PCT_IO(x) scaling */
++#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) (pct) / 10000.0)))
++
++ulint	srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
++
++ulint	srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
++
++uint	srv_read_ahead = 3; /* 1: random  2: linear  3: Both */
++uint	srv_adaptive_checkpoint = 0; /* 0: none  1: reflex  2: estimate */
+ /*-------------------------------------------*/
+ ulong	srv_n_spin_wait_rounds	= 20;
+ ulong	srv_n_free_tickets_to_enter = 500;
+@@ -2228,6 +2248,10 @@
+ 	ulint		n_pend_ios;
+ 	ibool		skip_sleep	= FALSE;
+ 	ulint		i;
++
++	dulint		lsn_old;
++	
++	dulint		oldest_lsn;
+ 	
+ #ifdef UNIV_DEBUG_THREAD_CREATION
+ 	fprintf(stderr, "Master thread starts, id %lu\n",
+@@ -2244,6 +2268,9 @@
+ 
+ 	mutex_exit(&kernel_mutex);
+ 
++	mutex_enter(&(log_sys->mutex));
++	lsn_old = log_sys->lsn;
++	mutex_exit(&(log_sys->mutex));
+ 	os_event_set(srv_sys->operational);
+ loop:
+ 	/*****************************************************************/
+@@ -2279,6 +2306,18 @@
+ 		if (!skip_sleep) {
+ 
+ 		        os_thread_sleep(1000000);
++			/*
++			mutex_enter(&(log_sys->mutex));
++			oldest_lsn = buf_pool_get_oldest_modification();
++			dulint	lsn = log_sys->lsn;
++			mutex_exit(&(log_sys->mutex));
++
++			if (!ut_dulint_is_zero(oldest_lsn))
++			fprintf(stderr,
++				"InnoDB flush: age pct: %lu, lsn progress: %lu\n",
++				ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++				ut_dulint_minus(lsn, lsn_old));
++			*/
+ 		}
+ 
+ 		skip_sleep = FALSE;
+@@ -2317,13 +2356,14 @@
+ 						+ log_sys->n_pending_writes;
+ 		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ 						+ buf_pool->n_pages_written;
+-		if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
++		if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
+ 			srv_main_thread_op_info = "doing insert buffer merge";
+-			ibuf_contract_for_n_pages(TRUE, 5);
++			ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+ 
+ 			srv_main_thread_op_info = "flushing log";
+ 
+-			log_buffer_flush_to_disk();
++			/* No fsync when srv_flush_log_at_trx_commit != 1 */
++			log_buffer_flush_maybe_sync();
+ 		}
+ 
+ 		if (buf_get_modified_ratio_pct() >
+@@ -2332,7 +2372,7 @@
+ 			/* Try to keep the number of modified pages in the
+ 			buffer pool under the limit wished by the user */
+ 			
+-			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ 							  ut_dulint_max);
+ 
+ 		        /* If we had to do the flush, it may have taken
+@@ -2341,6 +2381,140 @@
+ 			iteration of this loop. */
+ 			     
+ 			skip_sleep = TRUE;
++			mutex_enter(&(log_sys->mutex));
++			lsn_old = log_sys->lsn;
++			mutex_exit(&(log_sys->mutex));
++		} else if (srv_adaptive_checkpoint == 1) {
++
++			/* Try to keep the modified age (lsn - oldest modification)
++			below the max_checkpoint_age * 7/8 line */
++
++			mutex_enter(&(log_sys->mutex));
++			lsn_old = log_sys->lsn;
++			oldest_lsn = buf_pool_get_oldest_modification();
++			if (ut_dulint_is_zero(oldest_lsn)) {
++
++				mutex_exit(&(log_sys->mutex));
++
++			} else {
++				if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++					/* We should not flush from here. */
++					mutex_exit(&(log_sys->mutex));
++				} else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
++
++					/* 2nd defence line (max_checkpoint_age * 3/4) */
++
++					mutex_exit(&(log_sys->mutex));
++
++					n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
++									  ut_dulint_max);
++					skip_sleep = TRUE;
++				} else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++					   > (log_sys->max_checkpoint_age)/2 ) {
++
++					/* 1st defence line (max_checkpoint_age * 1/2) */
++
++					mutex_exit(&(log_sys->mutex));
++
++					n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
++									  ut_dulint_max);
++					skip_sleep = TRUE;
++				} else {
++					mutex_exit(&(log_sys->mutex));
++				}
++			}
++		} else if (srv_adaptive_checkpoint == 2) {
++
++			/* Try to keep the modified age (lsn - oldest modification)
++			below the max_checkpoint_age * 7/8 line */
++
++			mutex_enter(&(log_sys->mutex));
++
++			oldest_lsn = buf_pool_get_oldest_modification();
++			if (ut_dulint_is_zero(oldest_lsn)) {
++				lsn_old = log_sys->lsn;
++				mutex_exit(&(log_sys->mutex));
++
++			} else {
++				if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++					/* We should not flush from here. */
++					lsn_old = log_sys->lsn;
++					mutex_exit(&(log_sys->mutex));
++				} else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++					   > (log_sys->max_checkpoint_age)/2 ) {
++
++					/* defence line (max_checkpoint_age * 1/2) */
++					/* All locals first: C89 forbids declarations after statements */
++					dulint		lsn = log_sys->lsn;
++					ib_longlong	level, bpl;
++					buf_block_t*	bpage;
++
++					mutex_exit(&(log_sys->mutex));
++
++					mutex_enter(&buf_pool->mutex);
++
++					level = 0;
++					bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
++
++					while (bpage != NULL) {
++						dulint	oldest_modification = bpage->oldest_modification;
++						if (!ut_dulint_is_zero(oldest_modification)) {
++							level += log_sys->max_checkpoint_age
++								 - ut_dulint_minus(lsn, oldest_modification);
++						}
++						bpage = UT_LIST_GET_NEXT(flush_list, bpage);
++					}
++
++					if (level) {
++						bpl = ((ib_longlong) UT_LIST_GET_LEN(buf_pool->flush_list)
++							* UT_LIST_GET_LEN(buf_pool->flush_list)
++							* ut_dulint_minus(lsn, lsn_old)) / level;
++					} else {
++						bpl = 0;
++					}
++
++					mutex_exit(&buf_pool->mutex);
++
++					if (!srv_use_doublewrite_buf) {
++						/* flush is faster than when doublewrite */
++						bpl = (bpl * 3) / 4;
++					}
++
++					if(bpl) {
++retry_flush_batch:
++						n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
++									bpl,
++									ut_dulint_add(oldest_lsn,
++										ut_dulint_minus(lsn,
++											lsn_old)));
++						if (n_pages_flushed == ULINT_UNDEFINED) {
++							os_thread_sleep(5000);
++							goto retry_flush_batch;
++						}
++					}
++
++					lsn_old = lsn;
++					/*
++					fprintf(stderr,
++						"InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
++						ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++						ut_dulint_minus(lsn, lsn_old), bpl);
++					*/
++				} else {
++					lsn_old = log_sys->lsn;
++					mutex_exit(&(log_sys->mutex));
++				}
++			}
++
++		} else {
++			mutex_enter(&(log_sys->mutex));
++			lsn_old = log_sys->lsn;
++			mutex_exit(&(log_sys->mutex));
+ 		}
+ 
+ 		if (srv_activity_count == old_activity_count) {
+@@ -2367,23 +2541,25 @@
+ 	n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ 	n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ 						+ buf_pool->n_pages_written;
+-	if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
++	if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_very_old < PCT_IO(200))) {
+ 
+ 		srv_main_thread_op_info = "flushing buffer pool pages";
+-		buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++		buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ 
+ 		srv_main_thread_op_info = "flushing log";
+-		log_buffer_flush_to_disk();
++		/* No fsync when srv_flush_log_at_trx_commit != 1 */
++		log_buffer_flush_maybe_sync();
+ 	}
+ 
+ 	/* We run a batch of insert buffer merge every 10 seconds,
+ 	even if the server were active */
+ 
+ 	srv_main_thread_op_info = "doing insert buffer merge";
+-	ibuf_contract_for_n_pages(TRUE, 5);
++	ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+ 
+ 	srv_main_thread_op_info = "flushing log";
+-	log_buffer_flush_to_disk();
++	/* No fsync when srv_flush_log_at_trx_commit != 1 */
++	log_buffer_flush_maybe_sync();
+ 
+ 	/* We run a full purge every 10 seconds, even if the server
+ 	were active */
+@@ -2422,14 +2598,14 @@
+ 		(> 70 %), we assume we can afford reserving the disk(s) for
+ 		the time it requires to flush 100 pages */
+ 
+-	        n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++	        n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ 							ut_dulint_max);
+ 	} else {
+ 	        /* Otherwise, we only flush a small number of pages so that
+ 		we do not unnecessarily use much disk i/o capacity from
+ 		other work */
+ 
+-	        n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
++	        n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ 							ut_dulint_max);
+ 	}
+ 
+@@ -2518,7 +2694,7 @@
+ 	if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ 	        n_bytes_merged = 0;
+ 	} else {
+-	        n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
++	        n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(100));
+ 	}
+ 
+ 	srv_main_thread_op_info = "reserving kernel mutex";
+@@ -2535,7 +2711,7 @@
+ 
+ 	if (srv_fast_shutdown < 2) {
+ 		n_pages_flushed =
+-			buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++			buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ 	} else {
+ 		/* In the fastest shutdown we do not flush the buffer pool
+ 		to data files: we set n_pages_flushed to 0 artificially. */
+@@ -2557,7 +2733,14 @@
+ 
+ 	srv_main_thread_op_info = "flushing log";
+ 
+-	log_buffer_flush_to_disk();
++	current_time = time(NULL);
++	if (difftime(current_time, last_flush_time) > 1) {
++		log_buffer_flush_to_disk();
++		last_flush_time = current_time;
++	} else {
++		/* No fsync when srv_flush_log_at_trx_commit != 1 */
++		log_buffer_flush_maybe_sync();
++	}
+ 
+ 	srv_main_thread_op_info = "making checkpoint";
+ 
+diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c	2009-05-08 06:12:12.000000000 +0900
++++ b/innobase/srv/srv0start.c	2009-07-02 16:44:49.000000000 +0900
+@@ -1205,24 +1205,28 @@
+ 		return(DB_ERROR);
+ 	}
+ 
++	/* Overwrite innodb_file_io_threads: derive it from the read and write thread counts */
++	srv_n_file_io_threads = 2 + srv_n_read_io_threads + srv_n_write_io_threads;
++
+ 	/* Restrict the maximum number of file i/o threads */
+ 	if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+ 
+ 		srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
++		srv_n_read_io_threads = srv_n_write_io_threads = (SRV_MAX_N_IO_THREADS - 2) / 2;
+ 	}
+ 
+ 	if (!os_aio_use_native_aio) {
+  		/* In simulated aio we currently have use only for 4 threads */
+-		srv_n_file_io_threads = 4;
++		/* srv_n_file_io_threads is no longer forced to 4 here; the value computed above is kept */
+ 
+ 		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+ 						* srv_n_file_io_threads,
+-					srv_n_file_io_threads,
++					srv_n_read_io_threads, srv_n_write_io_threads,
+ 					SRV_MAX_N_PENDING_SYNC_IOS);
+ 	} else {
+ 		os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+ 						* srv_n_file_io_threads,
+-					srv_n_file_io_threads,
++					srv_n_read_io_threads, srv_n_write_io_threads,
+ 					SRV_MAX_N_PENDING_SYNC_IOS);
+ 	}
+ 	
+diff -ruN a/patch_info/innodb_io_patches.info b/patch_info/innodb_io_patches.info
+--- /dev/null	1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_io_patches.info	2009-07-02 16:44:49.000000000 +0900
+@@ -0,0 +1,11 @@
++File=innodb_io_patches.patch
++Name=Cluster of past InnoDB IO patches
++Version=1.1
++Author=Percona
++License=GPL
++Comment=This patch contains fixed (control_flush_and_merge_and_read, control_io-threads, adaptive_flush)
++ChangeLog=
++2008-11-06
++YK: Initial release
++2009-01-09
++YK: Some parameters are added
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc	2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.cc	2009-07-02 16:44:49.000000000 +0900
+@@ -149,6 +149,7 @@
+      innobase_lock_wait_timeout, innobase_force_recovery,
+      innobase_open_files;
+ 
++long innobase_read_io_threads, innobase_write_io_threads;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+ 
+ /* The default values for the following char* start-up parameters
+@@ -1417,6 +1418,8 @@
+ 	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+ 
+ 	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
++	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
++	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+ 
+ 	srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ 	srv_force_recovery = (ulint) innobase_force_recovery;
+@@ -7330,6 +7333,10 @@
+         trx_t* trx = check_trx_exists(thd);
+ 
+         if (thd->lex->sql_command != SQLCOM_XA_PREPARE) {
++		if (srv_enable_unsafe_group_commit && !thd->variables.innodb_support_xa) {
++			/* choose group commit rather than binlog order */
++			return(0);
++		}
+ 
+                 /* For ibbackup to work the order of transactions in binlog
+                 and InnoDB must be the same. Consider the situation
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h	2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.h	2009-07-02 18:10:51.000000000 +0900
+@@ -204,6 +204,7 @@
+ extern long innobase_additional_mem_pool_size;
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
++extern long innobase_read_io_threads, innobase_write_io_threads;
+ extern long innobase_force_recovery;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+@@ -234,6 +235,15 @@
+ extern ulong srv_thread_concurrency;
+ extern ulong srv_commit_concurrency;
+ extern ulong srv_flush_log_at_trx_commit;
++extern ulong srv_io_capacity;
++extern long long srv_ibuf_max_size;
++extern ulong srv_ibuf_active_contract;
++extern ulong srv_ibuf_accel_rate;
++extern ulong srv_flush_neighbor_pages;
++extern ulong srv_enable_unsafe_group_commit;
++extern uint srv_read_ahead;
++extern uint srv_adaptive_checkpoint;
++
+ /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does
+ NOT update cardinality for indexes of InnoDB table". By default we are
+ running with the fix disabled because MySQL 5.1 is frozen for such
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc	2009-07-02 16:43:23.000000000 +0900
++++ b/sql/mysqld.cc	2009-07-02 18:00:04.000000000 +0900
+@@ -5086,6 +5086,16 @@
+   OPT_INNODB_ROLLBACK_ON_TIMEOUT,
+   OPT_SECURE_FILE_PRIV,
+   OPT_KEEP_FILES_ON_CREATE,
++  OPT_INNODB_IO_CAPACITY,
++  OPT_INNODB_IBUF_MAX_SIZE,
++  OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++  OPT_INNODB_IBUF_ACCEL_RATE,
++  OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++  OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++  OPT_INNODB_READ_AHEAD,
++  OPT_INNODB_ADAPTIVE_CHECKPOINT,
++  OPT_INNODB_READ_IO_THREADS,
++  OPT_INNODB_WRITE_IO_THREADS,
+   OPT_INNODB_ADAPTIVE_HASH_INDEX,
+   OPT_FEDERATED,
+   OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+@@ -5403,6 +5413,44 @@
+    (gptr*) &srv_use_legacy_cardinality_algorithm,
+    (gptr*) &srv_use_legacy_cardinality_algorithm,
+    0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
++  {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
++   "Number of IO operations per second the server can do. Tunes background IO rate.",
++   (gptr*) &srv_io_capacity, (gptr*) &srv_io_capacity,
++   0, GET_ULONG, REQUIRED_ARG, 200, 100, 999999999, 0, 0, 0},
++  {"innodb_ibuf_max_size", OPT_INNODB_IBUF_MAX_SIZE,
++   "The maximum size of the insert buffer. (in bytes)",
++   (gptr*) &srv_ibuf_max_size, (gptr*) &srv_ibuf_max_size, 0,
++   GET_LL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
++  {"innodb_ibuf_active_contract", OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++   "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
++   (gptr*) &srv_ibuf_active_contract, (gptr*) &srv_ibuf_active_contract,
++   0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++  {"innodb_ibuf_accel_rate", OPT_INNODB_IBUF_ACCEL_RATE,
++   "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
++   (gptr*) &srv_ibuf_accel_rate, (gptr*) &srv_ibuf_accel_rate,
++   0, GET_ULONG, REQUIRED_ARG, 100, 100, 999999999, 0, 0, 0},
++  {"innodb_flush_neighbor_pages", OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++   "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
++   (gptr*) &srv_flush_neighbor_pages, (gptr*) &srv_flush_neighbor_pages,
++   0, GET_ULONG, REQUIRED_ARG, 1, 0, 1, 0, 0, 0},
++  {"innodb_read_ahead", OPT_INNODB_READ_AHEAD,
++   "Control read ahead activity. (none, random, linear, [both])",
++   0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++  {"innodb_adaptive_checkpoint", OPT_INNODB_ADAPTIVE_CHECKPOINT,
++   "Enable/Disable flushing along modified age. ([none], reflex, estimate)",
++   0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++  {"innodb_enable_unsafe_group_commit", OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++   "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
++   (gptr*) &srv_enable_unsafe_group_commit, (gptr*) &srv_enable_unsafe_group_commit,
++   0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++  {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
++   "Number of background read I/O threads in InnoDB.",
++   (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads,
++   0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++  {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
++   "Number of background write I/O threads in InnoDB.",
++   (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
++   0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+   {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+    (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+@@ -7644,6 +7692,38 @@
+   case OPT_INNODB_LOG_ARCHIVE:
+     innobase_log_archive= argument ? test(atoi(argument)) : 1;
+     break;
++  case OPT_INNODB_READ_AHEAD: /* parse the innodb_read_ahead option into srv_read_ahead (0 none, 1 random, 2 linear, 3 both) */
++    if (argument == disabled_my_option)
++      srv_read_ahead = 0; /* option disabled: same value as "none" */
++    else if (! argument)
++      srv_read_ahead = 3; /* no argument given: same value as "both" */
++    else
++    {
++      int type;
++      if ((type=find_type(argument, &innodb_read_ahead_typelib, 2)) <= 0)
++      {
++        fprintf(stderr,"Unknown innodb_read_ahead type: %s\n",argument);
++        exit(1);
++      }
++      srv_read_ahead = (uint) ((type - 1) & 3); /* type - 1 indexes innodb_read_ahead_names (assumes find_type() is 1-based -- confirm); & 3 folds the legacy "0".."3" entries onto none..both */
++    }
++    break;
++  case OPT_INNODB_ADAPTIVE_CHECKPOINT: /* parse the innodb_adaptive_checkpoint option into srv_adaptive_checkpoint (0 none, 1 reflex, 2 estimate) */
++    if (argument == disabled_my_option)
++      srv_adaptive_checkpoint = 0; /* option disabled: same value as "none" */
++    else if (! argument)
++      srv_adaptive_checkpoint = 0; /* no argument given: also "none" */
++    else
++    {
++      int type;
++      if ((type=find_type(argument, &innodb_adaptive_checkpoint_typelib, 2)) <= 0)
++      {
++        fprintf(stderr,"Unknown innodb_adaptive_checkpoint type: %s\n",argument);
++        exit(1);
++      }
++      srv_adaptive_checkpoint = (uint) ((type - 1) % 3); /* type - 1 indexes innodb_adaptive_checkpoint_names (assumes find_type() is 1-based -- confirm); % 3 folds the legacy "0".."2" entries onto none..estimate */
++    }
++    break;
+ #endif /* HAVE_INNOBASE_DB */
+   case OPT_MYISAM_RECOVER:
+   {
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc	2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.cc	2009-07-02 17:45:29.000000000 +0900
+@@ -489,6 +489,57 @@
+ sys_var_long_ptr  sys_innodb_flush_log_at_trx_commit(
+                                         "innodb_flush_log_at_trx_commit",
+                                         &srv_flush_log_at_trx_commit);
++sys_var_long_ptr	sys_innodb_io_capacity("innodb_io_capacity",
++                                               &srv_io_capacity);
++sys_var_long_ptr	sys_innodb_ibuf_active_contract("innodb_ibuf_active_contract",
++                                                        &srv_ibuf_active_contract);
++sys_var_long_ptr	sys_innodb_ibuf_accel_rate("innodb_ibuf_accel_rate",
++                                                   &srv_ibuf_accel_rate);
++sys_var_long_ptr	sys_innodb_flush_neighbor_pages("innodb_flush_neighbor_pages",
++                                                        &srv_flush_neighbor_pages);
++
++const char *innodb_read_ahead_names[]=
++{
++  "none", /* 0 */
++  "random",
++  "linear",
++  "both", /* 3 */
++  /* For compatibility of the older patch */
++  "0", /* 4 ("none" + 4) */
++  "1",
++  "2",
++  "3", /* 7 ("both" + 4) */
++  NullS
++};
++TYPELIB innodb_read_ahead_typelib=
++{
++  array_elements(innodb_read_ahead_names) - 1, "innodb_read_ahead_typelib",
++  innodb_read_ahead_names, NULL
++};
++sys_var_enum	sys_innodb_read_ahead("innodb_read_ahead", &srv_read_ahead,
++                                      &innodb_read_ahead_typelib, fix_innodb_read_ahead);
++sys_var_long_ptr	sys_innodb_enable_unsafe_group_commit("innodb_enable_unsafe_group_commit",
++                                                             &srv_enable_unsafe_group_commit);
++
++const char *innodb_adaptive_checkpoint_names[]=
++{
++  "none", /* 0 */
++  "reflex", /* 1 */
++  "estimate", /* 2 */
++  /* For compatibility of the older patch */
++  "0", /* 3 ("none" + 3) */
++  "1", /* 4 ("reflex" + 3) */
++  "2", /* 5 ("estimate" + 3) */
++  NullS
++};
++TYPELIB innodb_adaptive_checkpoint_typelib=
++{
++  array_elements(innodb_adaptive_checkpoint_names) - 1, "innodb_adaptive_checkpoint_typelib",
++  innodb_adaptive_checkpoint_names, NULL
++};
++sys_var_enum	sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint",
++                           &srv_adaptive_checkpoint,
++                           &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint);
+ sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path", 
+                                                &innobase_data_file_path);
+ sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir", 
+@@ -860,6 +911,13 @@
+   &sys_innodb_thread_concurrency,
+   &sys_innodb_commit_concurrency,
+   &sys_innodb_flush_log_at_trx_commit,
++  &sys_innodb_io_capacity,
++  &sys_innodb_ibuf_active_contract,
++  &sys_innodb_ibuf_accel_rate,
++  &sys_innodb_flush_neighbor_pages,
++  &sys_innodb_read_ahead,
++  &sys_innodb_enable_unsafe_group_commit,
++  &sys_innodb_adaptive_checkpoint,
+ #endif
+   &sys_trust_routine_creators,
+   &sys_trust_function_creators,
+@@ -997,6 +1055,16 @@
+   {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
+   {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
+   {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
++  {sys_innodb_io_capacity.name, (char*) &sys_innodb_io_capacity, SHOW_SYS},
++  {"innodb_ibuf_max_size", (char*) &srv_ibuf_max_size, SHOW_LONGLONG},
++  {sys_innodb_ibuf_active_contract.name, (char*) &sys_innodb_ibuf_active_contract, SHOW_SYS},
++  {sys_innodb_ibuf_accel_rate.name, (char*) &sys_innodb_ibuf_accel_rate, SHOW_SYS},
++  {sys_innodb_flush_neighbor_pages.name, (char*) &sys_innodb_flush_neighbor_pages, SHOW_SYS},
++  {sys_innodb_read_ahead.name, (char*) &sys_innodb_read_ahead, SHOW_SYS},
++  {sys_innodb_enable_unsafe_group_commit.name, (char*) &sys_innodb_enable_unsafe_group_commit, SHOW_SYS},
++  {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
++  {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
++  {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
+   {sys_innodb_use_legacy_cardinality_algorithm.name,
+    (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS},
+ #endif
+@@ -1459,6 +1527,18 @@
+   }
+ }
+ 
++#ifdef HAVE_INNOBASE_DB
++extern void fix_innodb_read_ahead(THD *thd, enum_var_type type)
++{ /* Passed as the last constructor argument of sys_innodb_read_ahead; thd and type are unused. */
++  srv_read_ahead &= 3; /* fold typelib indexes 4..7 (legacy names "0".."3") onto 0..3 (none, random, linear, both) */
++}
++
++extern void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type)
++{ /* Passed as the last constructor argument of sys_innodb_adaptive_checkpoint; thd and type are unused. */
++  srv_adaptive_checkpoint %= 3; /* fold typelib indexes 3..5 (legacy names "0".."2") onto 0..2 (none, reflex, estimate) */
++}
++#endif /* HAVE_INNOBASE_DB */
++
+ static void fix_max_binlog_size(THD *thd, enum_var_type type)
+ {
+   DBUG_ENTER("fix_max_binlog_size");
+diff -ruN a/sql/set_var.h b/sql/set_var.h
+--- a/sql/set_var.h	2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.h	2009-07-02 17:35:17.000000000 +0900
+@@ -31,6 +31,11 @@
+ 
+ extern TYPELIB bool_typelib, delay_key_write_typelib, sql_mode_typelib;
+ 
++#ifdef HAVE_INNOBASE_DB
++extern TYPELIB innodb_read_ahead_typelib;
++extern TYPELIB innodb_adaptive_checkpoint_typelib;
++#endif /* HAVE_INNOBASE_DB */
++
+ typedef int (*sys_check_func)(THD *,  set_var *);
+ typedef bool (*sys_update_func)(THD *, set_var *);
+ typedef void (*sys_after_update_func)(THD *,enum_var_type);
+@@ -1148,6 +1153,10 @@
+ int sql_set_variables(THD *thd, List<set_var_base> *var_list);
+ bool not_all_support_one_shot(List<set_var_base> *var_list);
+ void fix_delay_key_write(THD *thd, enum_var_type type);
++#ifdef HAVE_INNOBASE_DB
++void fix_innodb_read_ahead(THD *thd, enum_var_type type);
++void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type);
++#endif /* HAVE_INNOBASE_DB */
+ ulong fix_sql_mode(ulong sql_mode);
+ extern sys_var_const_str sys_charset_system;
+ extern sys_var_str sys_init_connect;