summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'percona/5.0.84-b18-20090811/innodb_io_patches.patch')
-rw-r--r--percona/5.0.84-b18-20090811/innodb_io_patches.patch1379
1 files changed, 1379 insertions, 0 deletions
diff --git a/percona/5.0.84-b18-20090811/innodb_io_patches.patch b/percona/5.0.84-b18-20090811/innodb_io_patches.patch
new file mode 100644
index 0000000..aaef29a
--- /dev/null
+++ b/percona/5.0.84-b18-20090811/innodb_io_patches.patch
@@ -0,0 +1,1379 @@
+diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c 2009-05-08 06:12:03.000000000 +0900
++++ b/innobase/buf/buf0flu.c 2009-07-02 16:44:49.000000000 +0900
+@@ -898,10 +898,17 @@
+
+ old_page_count = page_count;
+
++ if (srv_flush_neighbor_pages) {
+ /* Try to flush also all the neighbors */
+ page_count +=
+ buf_flush_try_neighbors(space, offset,
+ flush_type);
++ } else {
++ /* Try to flush the page only */
++ page_count +=
++ buf_flush_try_page(space, offset,
++ flush_type);
++ }
+ /* fprintf(stderr,
+ "Flush type %lu, page no %lu, neighb %lu\n",
+ flush_type, offset,
+diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/buf/buf0rea.c 2009-07-02 16:44:49.000000000 +0900
+@@ -20,6 +20,7 @@
+ #include "os0file.h"
+ #include "srv0start.h"
+
++extern uint srv_read_ahead;
+ extern ulint srv_read_ahead_rnd;
+ extern ulint srv_read_ahead_seq;
+ extern ulint srv_buf_pool_reads;
+@@ -189,6 +190,10 @@
+ ulint err;
+ ulint i;
+
++ if (!(srv_read_ahead & 1)) {
++ return(0);
++ }
++
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+@@ -396,6 +401,10 @@
+ ulint err;
+ ulint i;
+
++ if (!(srv_read_ahead & 2)) {
++ return(0);
++ }
++
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c 2009-05-08 06:12:04.000000000 +0900
++++ b/innobase/ibuf/ibuf0ibuf.c 2009-07-02 16:44:49.000000000 +0900
+@@ -370,8 +370,9 @@
+ grow in size, as the references on the upper levels of the tree can
+ change */
+
+- ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
+- / IBUF_POOL_SIZE_PER_MAX_SIZE;
++ ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
++ / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
++ srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
+ ibuf->meter = IBUF_THRESHOLD + 1;
+
+ UT_LIST_INIT(ibuf->data_list);
+@@ -2258,11 +2259,13 @@
+
+ mutex_enter(&ibuf_mutex);
+
++ if (!srv_ibuf_active_contract) {
+ if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+ mutex_exit(&ibuf_mutex);
+
+ return;
+ }
++ }
+
+ sync = FALSE;
+
+diff -ruN a/innobase/include/log0log.h b/innobase/include/log0log.h
+--- a/innobase/include/log0log.h 2009-05-08 06:12:06.000000000 +0900
++++ b/innobase/include/log0log.h 2009-07-02 16:44:49.000000000 +0900
+@@ -169,6 +169,13 @@
+ log_buffer_flush_to_disk(void);
+ /*==========================*/
+ /********************************************************************
++Flushes the log buffer. Forces it to disk depending on the value of
++the configuration parameter innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void);
++/*=============================*/
++/********************************************************************
+ Advances the smallest lsn for which there are unflushed dirty blocks in the
+ buffer pool and also may make a new checkpoint. NOTE: this function may only
+ be called if the calling thread owns no synchronization objects! */
+diff -ruN a/innobase/include/os0file.h b/innobase/include/os0file.h
+--- a/innobase/include/os0file.h 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/os0file.h 2009-07-02 16:44:49.000000000 +0900
+@@ -551,8 +551,10 @@
+ /*========*/
+ ulint n, /* in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++// ulint n_segments, /* in: combined number of segments in the four
++// first aio arrays; must be >= 4 */
++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads */
++ ulint n_write_threads, /**/
+ ulint n_slots_sync); /* in: number of slots in the sync aio array */
+ /***********************************************************************
+ Requests an asynchronous i/o operation. */
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-07-02 18:02:38.000000000 +0900
+@@ -89,6 +89,8 @@
+ extern ulint srv_lock_table_size;
+
+ extern ulint srv_n_file_io_threads;
++extern ulint srv_n_read_io_threads;
++extern ulint srv_n_write_io_threads;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ extern ibool srv_log_archive_on;
+@@ -133,6 +135,15 @@
+ extern ulong srv_max_purge_lag;
+ extern ibool srv_use_awe;
+ extern ibool srv_use_adaptive_hash_indexes;
++
++extern ulint srv_io_capacity;
++extern long long srv_ibuf_max_size;
++extern ulint srv_ibuf_active_contract;
++extern ulint srv_ibuf_accel_rate;
++extern ulint srv_flush_neighbor_pages;
++extern ulint srv_enable_unsafe_group_commit;
++extern uint srv_read_ahead;
++extern uint srv_adaptive_checkpoint;
+ /*-------------------------------------------*/
+
+ extern ulint srv_n_rows_inserted;
+diff -ruN a/innobase/log/log0log.c b/innobase/log/log0log.c
+--- a/innobase/log/log0log.c 2009-05-08 06:12:10.000000000 +0900
++++ b/innobase/log/log0log.c 2009-07-02 16:44:49.000000000 +0900
+@@ -1524,6 +1524,29 @@
+ }
+
+ /********************************************************************
++Flush the log buffer. Force it to disk depending on the value of
++innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void)
++/*=============================*/
++{
++ dulint lsn;
++
++ mutex_enter(&(log_sys->mutex));
++
++ lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
++ srv_flush_log_at_trx_commit == 1 ?
++ LOG_WRITE_FROM_BACKGROUND_SYNC :
++ LOG_WRITE_FROM_BACKGROUND_ASYNC);
++}
++/********************************************************************
+ Tries to establish a big enough margin of free space in the log buffer, such
+ that a new log entry can be catenated without an immediate need for a flush. */
+ static
+@@ -3326,6 +3349,15 @@
+ (ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn),
+ (ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn));
+
++ fprintf(file,
++ "Max checkpoint age %lu\n"
++ "Modified age %lu\n"
++ "Checkpoint age %lu\n",
++ (ulong) log_sys->max_checkpoint_age,
++ (ulong) ut_dulint_minus(log_sys->lsn,
++ log_buf_pool_get_oldest_modification()),
++ (ulong) ut_dulint_minus(log_sys->lsn, log_sys->last_checkpoint_lsn));
++
+ current_time = time(NULL);
+
+ time_elapsed = 0.001 + difftime(current_time,
+diff -ruN a/innobase/os/os0file.c b/innobase/os/os0file.c
+--- a/innobase/os/os0file.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/os/os0file.c 2009-07-02 16:44:49.000000000 +0900
+@@ -66,6 +66,28 @@
+
+ ibool os_aio_print_debug = FALSE;
+
+/* Status of an IO request in simulated AIO.
++ Protocol for simulated aio:
++ client requests IO: find slot with reserved = FALSE. Add entry with
++ status = OS_AIO_NOT_ISSUED.
++ IO thread wakes: find adjacent slots with reserved = TRUE and status =
++ OS_AIO_NOT_ISSUED. Change status for slots to
++ OS_AIO_ISSUED.
++ IO operation completes: set status for slots to OS_AIO_DONE. set status
++ for the first slot to OS_AIO_CLAIMED and return
++ result for that slot.
++ When there are multiple read and write threads, they all compete to execute
++ the requests in the array (os_aio_array_t). This avoids the need to load
++ balance requests at the time the request is made at the cost of waking all
++ threads when a request is available.
++*/
++typedef enum {
++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
++ OS_AIO_ISSUED, /* Being processed by an IO thread. */
++ OS_AIO_DONE, /* Request processed. */
++ OS_AIO_CLAIMED /* Result being returned to client. */
++} os_aio_status;
++
+ /* The aio array slot structure */
+ typedef struct os_aio_slot_struct os_aio_slot_t;
+
+@@ -74,6 +96,8 @@
+ ulint pos; /* index of the slot in the aio
+ array */
+ ibool reserved; /* TRUE if this slot is reserved */
++ os_aio_status status; /* Status for current request. Valid when reserved
++ is TRUE. Used only in simulated aio. */
+ time_t reservation_time;/* time when reserved */
+ ulint len; /* length of the block to read or
+ write */
+@@ -84,11 +108,11 @@
+ ulint offset_high; /* 32 high bits of file offset */
+ os_file_t file; /* file where to read or write */
+ const char* name; /* file name or path */
+- ibool io_already_done;/* used only in simulated aio:
+- TRUE if the physical i/o already
+- made and only the slot message
+- needs to be passed to the caller
+- of os_aio_simulated_handle */
++// ibool io_already_done;/* used only in simulated aio:
++// TRUE if the physical i/o already
++// made and only the slot message
++// needs to be passed to the caller
++// of os_aio_simulated_handle */
+ fil_node_t* message1; /* message which is given by the */
+ void* message2; /* the requester of an aio operation
+ and which can be used to identify
+@@ -137,6 +161,13 @@
+ /* Array of events used in simulated aio */
+ os_event_t* os_aio_segment_wait_events = NULL;
+
++/* Number for the first global segment for reading. */
++const ulint os_aio_first_read_segment = 2;
++
++/* Number for the first global segment for writing. Set to
++os_aio_first_read_segment + n_read_threads in os_aio_init(). */
++ulint os_aio_first_write_segment = 0;
++
+ /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+ are NULL when the module has not yet been initialized. */
+ static os_aio_array_t* os_aio_read_array = NULL;
+@@ -145,11 +176,17 @@
+ static os_aio_array_t* os_aio_log_array = NULL;
+ static os_aio_array_t* os_aio_sync_array = NULL;
+
++/* Per thread buffer used for merged IO requests. Used by
++os_aio_simulated_handle so that a buffer doesn't have to be allocated
++for each request. */
++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
++
+ static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+ /* If the following is TRUE, read i/o handler threads try to
+ wait until a batch of new read requests have been posted */
+-static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ ulint os_n_file_reads = 0;
+ ulint os_bytes_read_since_printout = 0;
+@@ -2878,8 +2915,10 @@
+ /*========*/
+ ulint n, /* in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++// ulint n_segments, /* in: combined number of segments in the four
++// first aio arrays; must be >= 4 */
++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads*/
++ ulint n_write_threads, /**/
+ ulint n_slots_sync) /* in: number of slots in the sync aio array */
+ {
+ ulint n_read_segs;
+@@ -2889,6 +2928,8 @@
+ #ifdef POSIX_ASYNC_IO
+ sigset_t sigset;
+ #endif
++ ulint n_segments = 2 + n_read_threads + n_write_threads;
++
+ ut_ad(n % n_segments == 0);
+ ut_ad(n_segments >= 4);
+
+@@ -2896,14 +2937,17 @@
+
+ for (i = 0; i < n_segments; i++) {
+ srv_set_io_thread_op_info(i, "not started yet");
++ os_aio_thread_buffer[i] = 0;
++ os_aio_thread_buffer_size[i] = 0;
+ }
+
+ n_per_seg = n / n_segments;
+- n_write_segs = (n_segments - 2) / 2;
+- n_read_segs = n_segments - 2 - n_write_segs;
++ n_write_segs = n_write_threads;
++ n_read_segs = n_read_threads;
+
+ /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+
++ os_aio_first_write_segment = os_aio_first_read_segment + n_read_threads;
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ srv_io_thread_function[0] = "insert buffer thread";
+@@ -2912,14 +2956,14 @@
+
+ srv_io_thread_function[1] = "log thread";
+
+- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
++ os_aio_read_array = os_aio_array_create(n_per_seg,
+ n_read_segs);
+ for (i = 2; i < 2 + n_read_segs; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
++ os_aio_write_array = os_aio_array_create(n_per_seg,
+ n_write_segs);
+ for (i = 2 + n_read_segs; i < n_segments; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+@@ -3181,6 +3225,13 @@
+ struct aiocb* control;
+ #endif
+ ulint i;
++ ulint prim_segment;
++ ulint n;
++
++ n = array->n_slots / array->n_segments;
++ /* Stripe requests over the segments in units of 64 pages (aligned to the maximum BUF_READ_AHEAD_AREA) */
++ prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments);
++
+ loop:
+ os_mutex_enter(array->mutex);
+
+@@ -3199,6 +3250,16 @@
+ goto loop;
+ }
+
++ for (i = prim_segment * n; i < array->n_slots; i++) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved == FALSE) {
++ break;
++ }
++ }
++
++ if (slot->reserved == TRUE){
++ /* No free slot at or after the preferred segment; search again from the start of the array. */
+ for (i = 0;; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+@@ -3206,6 +3267,7 @@
+ break;
+ }
+ }
++ }
+
+ array->n_reserved++;
+
+@@ -3228,7 +3290,8 @@
+ slot->buf = buf;
+ slot->offset = offset;
+ slot->offset_high = offset_high;
+- slot->io_already_done = FALSE;
++// slot->io_already_done = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ #ifdef WIN_ASYNC_IO
+ control = &(slot->control);
+@@ -3281,6 +3344,7 @@
+ ut_ad(slot->reserved);
+
+ slot->reserved = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ array->n_reserved--;
+
+@@ -3317,16 +3381,18 @@
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ os_mutex_enter(array->mutex);
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved) {
++ if (slot->reserved &&
++ (slot->status == OS_AIO_NOT_ISSUED ||
++ slot->status == OS_AIO_DONE)) {
+ /* Found an i/o request */
+
+ break;
+@@ -3336,7 +3402,25 @@
+ os_mutex_exit(array->mutex);
+
+ if (i < n) {
+- os_event_set(os_aio_segment_wait_events[global_segment]);
++ if (array == os_aio_ibuf_array) {
++ os_event_set(os_aio_segment_wait_events[0]);
++
++ } else if (array == os_aio_log_array) {
++ os_event_set(os_aio_segment_wait_events[1]);
++
++ } else if (array == os_aio_read_array) {
++ ulint x;
++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else if (array == os_aio_write_array) {
++ ulint x;
++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else {
++ ut_a(0);
++ }
+ }
+ }
+
+@@ -3347,8 +3431,6 @@
+ os_aio_simulated_wake_handler_threads(void)
+ /*=======================================*/
+ {
+- ulint i;
+-
+ if (os_aio_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+@@ -3357,9 +3439,10 @@
+
+ os_aio_recommend_sleep_for_read_threads = FALSE;
+
+- for (i = 0; i < os_aio_n_segments; i++) {
+- os_aio_simulated_wake_handler_thread(i);
+- }
++ os_aio_simulated_wake_handler_thread(0);
++ os_aio_simulated_wake_handler_thread(1);
++ os_aio_simulated_wake_handler_thread(os_aio_first_read_segment);
++ os_aio_simulated_wake_handler_thread(os_aio_first_write_segment);
+ }
+
+ /**************************************************************************
+@@ -3640,7 +3723,7 @@
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ if (array == os_aio_sync_array) {
+ os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+@@ -3648,12 +3731,12 @@
+ } else {
+ srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+ i = os_event_wait_multiple(n,
+- (array->native_events) + segment * n);
++ (array->native_events));
+ }
+
+ os_mutex_enter(array->mutex);
+
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+ ut_a(slot->reserved);
+
+@@ -3830,10 +3913,13 @@
+ os_aio_slot_t* slot;
+ os_aio_slot_t* slot2;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
++ os_aio_slot_t* lowest_request;
++ os_aio_slot_t* oldest_request;
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ ulint lowest_offset;
++ ulint oldest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+@@ -3841,6 +3927,7 @@
+ ibool ret;
+ ulint n;
+ ulint i;
++ time_t now;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+@@ -3853,7 +3940,7 @@
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+@@ -3875,9 +3962,9 @@
+ done */
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved && slot->io_already_done) {
++ if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+@@ -3897,67 +3984,57 @@
+ then pick the one at the lowest offset. */
+
+ biggest_age = 0;
+- lowest_offset = ULINT_MAX;
++ now = time(NULL);
++ oldest_request = lowest_request = NULL;
++ oldest_offset = lowest_offset = ULINT_MAX;
+
++ /* Find the oldest request and the request with the smallest offset */
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved) {
+- age = (ulint)difftime(time(NULL),
+- slot->reservation_time);
++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
++ age = (ulint)difftime(now, slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+- && slot->offset < lowest_offset)) {
++ && slot->offset < oldest_offset)) {
+
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+ biggest_age = age;
+- lowest_offset = slot->offset;
++ oldest_request = slot;
++ oldest_offset = slot->offset;
+ }
+- }
+- }
+-
+- if (n_consecutive == 0) {
+- /* There were no old requests. Look for an i/o request at the
+- lowest offset in the array (we ignore the high 32 bits of the
+- offset in these heuristics) */
+-
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array,
+- i + segment * n);
+-
+- if (slot->reserved && slot->offset < lowest_offset) {
+
++ /* Look for an i/o request at the lowest offset in the array
++ * (we ignore the high 32 bits of the offset) */
++ if (slot->offset < lowest_offset) {
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
++ lowest_request = slot;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+- if (n_consecutive == 0) {
++ if (!lowest_request && !oldest_request) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+- slot = consecutive_ios[0];
++ if (oldest_request) {
++ slot = oldest_request;
++ } else {
++ slot = lowest_request;
++ }
++ consecutive_ios[0] = slot;
++ n_consecutive = 1;
+
+ /* Check if there are several consecutive blocks to read or write */
+
+ consecutive_loop:
+ for (i = 0; i < n; i++) {
+- slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot2 = os_aio_array_get_nth_slot(array, i);
+
+ if (slot2->reserved && slot2 != slot
+ && slot2->offset == slot->offset + slot->len
+@@ -3965,7 +4042,8 @@
+ sum does not wrap over */
+ && slot2->offset_high == slot->offset_high
+ && slot2->type == slot->type
+- && slot2->file == slot->file) {
++ && slot2->file == slot->file
++ && slot2->status == OS_AIO_NOT_ISSUED) {
+
+ /* Found a consecutive i/o request */
+
+@@ -3994,6 +4072,8 @@
+
+ for (i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_ISSUED;
+ }
+
+ if (n_consecutive == 1) {
+@@ -4001,7 +4081,14 @@
+ combined_buf = slot->buf;
+ combined_buf2 = NULL;
+ } else {
+- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
++ if (os_aio_thread_buffer[global_segment])
++ ut_free(os_aio_thread_buffer[global_segment]);
++
++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
++ }
++ combined_buf2 = os_aio_thread_buffer[global_segment];
+
+ ut_a(combined_buf2);
+
+@@ -4012,6 +4099,9 @@
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
++ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_ISSUED);
++
+ os_mutex_exit(array->mutex);
+
+ if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+@@ -4081,16 +4171,13 @@
+ }
+ }
+
+- if (combined_buf2) {
+- ut_free(combined_buf2);
+- }
+-
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (i = 0; i < n_consecutive; i++) {
+- consecutive_ios[i]->io_already_done = TRUE;
++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_DONE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+@@ -4100,6 +4187,8 @@
+ slot_io_done:
+
+ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_DONE);
++ slot->status = OS_AIO_CLAIMED;
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-07-02 18:36:54.000000000 +0900
+@@ -167,6 +167,8 @@
+ ulint srv_lock_table_size = ULINT_MAX;
+
+ ulint srv_n_file_io_threads = ULINT_MAX;
++ulint srv_n_read_io_threads = 1;
++ulint srv_n_write_io_threads = 1;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ ibool srv_log_archive_on = FALSE;
+@@ -330,6 +332,24 @@
+ ibool srv_use_awe = FALSE;
+ ibool srv_use_adaptive_hash_indexes = TRUE;
+
++ulint srv_io_capacity = 100;
++
++/* Returns the number of IO operations that is X percent of the capacity.
++PCT_IO(5) -> returns the number of IO operations that is 5% of the max
++where max is srv_io_capacity. */
++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0)))
++
++long long srv_ibuf_max_size = 0;
++ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
++ulint srv_ibuf_accel_rate = 100;
++#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
++
++ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
++
++ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
++
++uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
++uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
+ /*-------------------------------------------*/
+ ulong srv_n_spin_wait_rounds = 20;
+ ulong srv_n_free_tickets_to_enter = 500;
+@@ -2228,6 +2248,10 @@
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
++
++ dulint lsn_old;
++
++ dulint oldest_lsn;
+
+ #ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+@@ -2244,6 +2268,9 @@
+
+ mutex_exit(&kernel_mutex);
+
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
+ os_event_set(srv_sys->operational);
+ loop:
+ /*****************************************************************/
+@@ -2279,6 +2306,18 @@
+ if (!skip_sleep) {
+
+ os_thread_sleep(1000000);
++ /*
++ mutex_enter(&(log_sys->mutex));
++ oldest_lsn = buf_pool_get_oldest_modification();
++ dulint lsn = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++
++ if (!ut_dulint_is_zero(oldest_lsn))
++ fprintf(stderr,
++ "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++ ut_dulint_minus(lsn, lsn_old));
++ */
+ }
+
+ skip_sleep = FALSE;
+@@ -2317,13 +2356,14 @@
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+ }
+
+ if (buf_get_modified_ratio_pct() >
+@@ -2332,7 +2372,7 @@
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+
+ /* If we had to do the flush, it may have taken
+@@ -2341,6 +2381,140 @@
+ iteration of this loop. */
+
+ skip_sleep = TRUE;
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ } else if (srv_adaptive_checkpoint == 1) {
++
++ /* Try to keep the modified age from exceeding the
++ max_checkpoint_age * 7/8 line */
++
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ oldest_lsn = buf_pool_get_oldest_modification();
++ if (ut_dulint_is_zero(oldest_lsn)) {
++
++ mutex_exit(&(log_sys->mutex));
++
++ } else {
++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++ /* We should not flush from here. */
++ mutex_exit(&(log_sys->mutex));
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
++
++ /* 2nd defence line (max_checkpoint_age * 3/4) */
++
++ mutex_exit(&(log_sys->mutex));
++
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
++ ut_dulint_max);
++ skip_sleep = TRUE;
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age)/2 ) {
++
++ /* 1st defence line (max_checkpoint_age * 1/2) */
++
++ mutex_exit(&(log_sys->mutex));
++
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
++ ut_dulint_max);
++ skip_sleep = TRUE;
++ } else {
++ mutex_exit(&(log_sys->mutex));
++ }
++ }
++ } else if (srv_adaptive_checkpoint == 2) {
++
++ /* Try to keep the modified age from exceeding the
++ max_checkpoint_age * 7/8 line */
++
++ mutex_enter(&(log_sys->mutex));
++
++ oldest_lsn = buf_pool_get_oldest_modification();
++ if (ut_dulint_is_zero(oldest_lsn)) {
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++
++ } else {
++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++ /* We should not flush from here. */
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age)/2 ) {
++
++ /* defence line (max_checkpoint_age * 1/2) */
++ dulint lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ ib_longlong level, bpl;
++ buf_block_t* bpage;
++
++ mutex_enter(&buf_pool->mutex);
++
++ level = 0;
++ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
++
++ while (bpage != NULL) {
++ dulint oldest_modification = bpage->oldest_modification;
++ if (!ut_dulint_is_zero(oldest_modification)) {
++ level += log_sys->max_checkpoint_age
++ - ut_dulint_minus(lsn, oldest_modification);
++ }
++ bpage = UT_LIST_GET_NEXT(flush_list, bpage);
++ }
++
++ if (level) {
++ bpl = ((ib_longlong) UT_LIST_GET_LEN(buf_pool->flush_list)
++ * UT_LIST_GET_LEN(buf_pool->flush_list)
++ * ut_dulint_minus(lsn, lsn_old)) / level;
++ } else {
++ bpl = 0;
++ }
++
++ mutex_exit(&buf_pool->mutex);
++
++ if (!srv_use_doublewrite_buf) {
++ /* flushing is faster when the doublewrite buffer is disabled */
++ bpl = (bpl * 3) / 4;
++ }
++
++ if(bpl) {
++retry_flush_batch:
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
++ bpl,
++ ut_dulint_add(oldest_lsn,
++ ut_dulint_minus(lsn,
++ lsn_old)));
++ if (n_pages_flushed == ULINT_UNDEFINED) {
++ os_thread_sleep(5000);
++ goto retry_flush_batch;
++ }
++ }
++
++ lsn_old = lsn;
++ /*
++ fprintf(stderr,
++ "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++ ut_dulint_minus(lsn, lsn_old), bpl);
++ */
++ } else {
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ }
++ }
++
++ } else {
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
+ }
+
+ if (srv_activity_count == old_activity_count) {
+@@ -2367,23 +2541,25 @@
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
++ if (n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server were active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+
+ /* We run a full purge every 10 seconds, even if the server
+ were active */
+@@ -2422,14 +2598,14 @@
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ ut_dulint_max);
+ }
+
+@@ -2518,7 +2694,7 @@
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ n_bytes_merged = 0;
+ } else {
+- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(100));
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+@@ -2535,7 +2711,7 @@
+
+ if (srv_fast_shutdown < 2) {
+ n_pages_flushed =
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ } else {
+ /* In the fastest shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+@@ -2557,7 +2733,14 @@
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ current_time = time(NULL);
++ if (difftime(current_time, last_flush_time) > 1) {
++ log_buffer_flush_to_disk();
++ last_flush_time = current_time;
++ } else {
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c 2009-05-08 06:12:12.000000000 +0900
++++ b/innobase/srv/srv0start.c 2009-07-02 16:44:49.000000000 +0900
+@@ -1205,24 +1205,28 @@
+ return(DB_ERROR);
+ }
+
++ /* Override innodb_file_io_threads: derive it from the read and write i/o thread counts */
++ srv_n_file_io_threads = 2 + srv_n_read_io_threads + srv_n_write_io_threads;
++
+ /* Restrict the maximum number of file i/o threads */
+ if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+
+ srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
++ srv_n_read_io_threads = srv_n_write_io_threads = (SRV_MAX_N_IO_THREADS - 2) / 2;
+ }
+
+ if (!os_aio_use_native_aio) {
+ /* In simulated aio we currently have use only for 4 threads */
+- srv_n_file_io_threads = 4;
++ /*srv_n_file_io_threads = 4;*/
+
+ os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+ * srv_n_file_io_threads,
+- srv_n_file_io_threads,
++ srv_n_read_io_threads, srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS);
+ } else {
+ os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+ * srv_n_file_io_threads,
+- srv_n_file_io_threads,
++ srv_n_read_io_threads, srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS);
+ }
+
+diff -ruN a/patch_info/innodb_io_patches.info b/patch_info/innodb_io_patches.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_io_patches.info 2009-07-02 16:44:49.000000000 +0900
+@@ -0,0 +1,11 @@
++File=innodb_io_patches.patch
++Name=Cluster of past InnoDB IO patches
++Version=1.1
++Author=Percona
++License=GPL
++Comment=This patch contains the fixed versions of earlier patches (control_flush_and_merge_and_read, control_io-threads, adaptive_flush)
++ChangeLog=
++2008-11-06
++YK: Initial release
++2009-01-09
++YK: Some parameters are added
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-07-02 16:44:49.000000000 +0900
+@@ -149,6 +149,7 @@
+ innobase_lock_wait_timeout, innobase_force_recovery,
+ innobase_open_files;
+
++long innobase_read_io_threads, innobase_write_io_threads;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+ /* The default values for the following char* start-up parameters
+@@ -1417,6 +1418,8 @@
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
++ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
++ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+@@ -7330,6 +7333,10 @@
+ trx_t* trx = check_trx_exists(thd);
+
+ if (thd->lex->sql_command != SQLCOM_XA_PREPARE) {
++ if (srv_enable_unsafe_group_commit && !thd->variables.innodb_support_xa) {
++ /* choose group commit rather than binlog order */
++ return(0);
++ }
+
+ /* For ibbackup to work the order of transactions in binlog
+ and InnoDB must be the same. Consider the situation
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.h 2009-07-02 18:10:51.000000000 +0900
+@@ -204,6 +204,7 @@
+ extern long innobase_additional_mem_pool_size;
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
++extern long innobase_read_io_threads, innobase_write_io_threads;
+ extern long innobase_force_recovery;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+@@ -234,6 +235,15 @@
+ extern ulong srv_thread_concurrency;
+ extern ulong srv_commit_concurrency;
+ extern ulong srv_flush_log_at_trx_commit;
++extern ulong srv_io_capacity;
++extern long long srv_ibuf_max_size;
++extern ulong srv_ibuf_active_contract;
++extern ulong srv_ibuf_accel_rate;
++extern ulong srv_flush_neighbor_pages;
++extern ulong srv_enable_unsafe_group_commit;
++extern uint srv_read_ahead;
++extern uint srv_adaptive_checkpoint;
++
+ /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does
+ NOT update cardinality for indexes of InnoDB table". By default we are
+ running with the fix disabled because MySQL 5.1 is frozen for such
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/mysqld.cc 2009-07-02 18:00:04.000000000 +0900
+@@ -5086,6 +5086,16 @@
+ OPT_INNODB_ROLLBACK_ON_TIMEOUT,
+ OPT_SECURE_FILE_PRIV,
+ OPT_KEEP_FILES_ON_CREATE,
++ OPT_INNODB_IO_CAPACITY,
++ OPT_INNODB_IBUF_MAX_SIZE,
++ OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++ OPT_INNODB_IBUF_ACCEL_RATE,
++ OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++ OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++ OPT_INNODB_READ_AHEAD,
++ OPT_INNODB_ADAPTIVE_CHECKPOINT,
++ OPT_INNODB_READ_IO_THREADS,
++ OPT_INNODB_WRITE_IO_THREADS,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
+ OPT_FEDERATED,
+ OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+@@ -5403,6 +5413,44 @@
+ (gptr*) &srv_use_legacy_cardinality_algorithm,
+ (gptr*) &srv_use_legacy_cardinality_algorithm,
+ 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
++ "Number of IO operations per second the server can do. Tunes background IO rate.",
++ (gptr*) &srv_io_capacity, (gptr*) &srv_io_capacity,
++ 0, GET_ULONG, REQUIRED_ARG, 200, 100, 999999999, 0, 0, 0},
++ {"innodb_ibuf_max_size", OPT_INNODB_IBUF_MAX_SIZE,
++ "The maximum size of the insert buffer. (in bytes)",
++ (gptr*) &srv_ibuf_max_size, (gptr*) &srv_ibuf_max_size, 0,
++ GET_LL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
++ {"innodb_ibuf_active_contract", OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++ "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
++ (gptr*) &srv_ibuf_active_contract, (gptr*) &srv_ibuf_active_contract,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_ibuf_accel_rate", OPT_INNODB_IBUF_ACCEL_RATE,
++ "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
++ (gptr*) &srv_ibuf_accel_rate, (gptr*) &srv_ibuf_accel_rate,
++ 0, GET_ULONG, REQUIRED_ARG, 100, 100, 999999999, 0, 0, 0},
++ {"innodb_flush_neighbor_pages", OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++ "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
++ (gptr*) &srv_flush_neighbor_pages, (gptr*) &srv_flush_neighbor_pages,
++ 0, GET_ULONG, REQUIRED_ARG, 1, 0, 1, 0, 0, 0},
++ {"innodb_read_ahead", OPT_INNODB_READ_AHEAD,
++ "Control read ahead activity. (none, random, linear, [both])",
++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++ {"innodb_adaptive_checkpoint", OPT_INNODB_ADAPTIVE_CHECKPOINT,
++ "Enable/Disable flushing along modified age. ([none], reflex, estimate)",
++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++ {"innodb_enable_unsafe_group_commit", OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++ "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
++ (gptr*) &srv_enable_unsafe_group_commit, (gptr*) &srv_enable_unsafe_group_commit,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
++ "Number of background read I/O threads in InnoDB.",
++ (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads,
++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
++ "Number of background write I/O threads in InnoDB.",
++ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+ {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+ (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+@@ -7644,6 +7692,38 @@
+ case OPT_INNODB_LOG_ARCHIVE:
+ innobase_log_archive= argument ? test(atoi(argument)) : 1;
+ break;
++ case OPT_INNODB_READ_AHEAD:
++ if (argument == disabled_my_option)
++ srv_read_ahead = 0;
++ else if (! argument)
++ srv_read_ahead = 3;
++ else
++ {
++ int type;
++ if ((type=find_type(argument, &innodb_read_ahead_typelib, 2)) <= 0)
++ {
++ fprintf(stderr,"Unknown innodb_read_ahead type: %s\n",argument);
++ exit(1);
++ }
++ srv_read_ahead = (uint) ((type - 1) & 3);
++ }
++ break;
++ case OPT_INNODB_ADAPTIVE_CHECKPOINT:
++ if (argument == disabled_my_option)
++ srv_adaptive_checkpoint = 0;
++ else if (! argument)
++ srv_adaptive_checkpoint = 0;
++ else
++ {
++ int type;
++ if ((type=find_type(argument, &innodb_adaptive_checkpoint_typelib, 2)) <= 0)
++ {
++ fprintf(stderr,"Unknown innodb_adaptive_checkpoint type: %s\n",argument);
++ exit(1);
++ }
++ srv_adaptive_checkpoint = (uint) ((type - 1) % 3);
++ }
++ break;
+ #endif /* HAVE_INNOBASE_DB */
+ case OPT_MYISAM_RECOVER:
+ {
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.cc 2009-07-02 17:45:29.000000000 +0900
+@@ -489,6 +489,57 @@
+ sys_var_long_ptr sys_innodb_flush_log_at_trx_commit(
+ "innodb_flush_log_at_trx_commit",
+ &srv_flush_log_at_trx_commit);
++sys_var_long_ptr sys_innodb_io_capacity("innodb_io_capacity",
++ &srv_io_capacity);
++sys_var_long_ptr sys_innodb_ibuf_active_contract("innodb_ibuf_active_contract",
++ &srv_ibuf_active_contract);
++sys_var_long_ptr sys_innodb_ibuf_accel_rate("innodb_ibuf_accel_rate",
++ &srv_ibuf_accel_rate);
++sys_var_long_ptr sys_innodb_flush_neighbor_pages("innodb_flush_neighbor_pages",
++ &srv_flush_neighbor_pages);
++
++const char *innodb_read_ahead_names[]=
++{
++ "none", /* 0 */
++ "random",
++ "linear",
++ "both", /* 3 */
++ /* For compatibility of the older patch */
++ "0", /* 4 ("none" + 4) */
++ "1",
++ "2",
++ "3", /* 7 ("both" + 4) */
++ NullS
++};
++TYPELIB innodb_read_ahead_typelib=
++{
++ array_elements(innodb_read_ahead_names) - 1, "innodb_read_ahead_typelib",
++ innodb_read_ahead_names, NULL
++};
++sys_var_enum sys_innodb_read_ahead("innodb_read_ahead", &srv_read_ahead,
++ &innodb_read_ahead_typelib, fix_innodb_read_ahead);
++sys_var_long_ptr sys_innodb_enable_unsafe_group_commit("innodb_enable_unsafe_group_commit",
++ &srv_enable_unsafe_group_commit);
++
++const char *innodb_adaptive_checkpoint_names[]=
++{
++ "none", /* 0 */
++ "reflex", /* 1 */
++ "estimate", /* 2 */
++ /* For compatibility of the older patch */
++ "0", /* 3 ("none" + 3) */
++ "1", /* 4 ("reflex" + 3) */
++ "2", /* 5 ("estimate" + 3) */
++ NullS
++};
++TYPELIB innodb_adaptive_checkpoint_typelib=
++{
++ array_elements(innodb_adaptive_checkpoint_names) - 1, "innodb_adaptive_checkpoint_typelib",
++ innodb_adaptive_checkpoint_names, NULL
++};
++sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint",
++ &srv_adaptive_checkpoint,
++ &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint);
+ sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path",
+ &innobase_data_file_path);
+ sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir",
+@@ -860,6 +911,13 @@
+ &sys_innodb_thread_concurrency,
+ &sys_innodb_commit_concurrency,
+ &sys_innodb_flush_log_at_trx_commit,
++ &sys_innodb_io_capacity,
++ &sys_innodb_ibuf_active_contract,
++ &sys_innodb_ibuf_accel_rate,
++ &sys_innodb_flush_neighbor_pages,
++ &sys_innodb_read_ahead,
++ &sys_innodb_enable_unsafe_group_commit,
++ &sys_innodb_adaptive_checkpoint,
+ #endif
+ &sys_trust_routine_creators,
+ &sys_trust_function_creators,
+@@ -997,6 +1055,16 @@
+ {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
+ {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
+ {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
++ {sys_innodb_io_capacity.name, (char*) &sys_innodb_io_capacity, SHOW_SYS},
++ {"innodb_ibuf_max_size", (char*) &srv_ibuf_max_size, SHOW_LONGLONG},
++ {sys_innodb_ibuf_active_contract.name, (char*) &sys_innodb_ibuf_active_contract, SHOW_SYS},
++ {sys_innodb_ibuf_accel_rate.name, (char*) &sys_innodb_ibuf_accel_rate, SHOW_SYS},
++ {sys_innodb_flush_neighbor_pages.name, (char*) &sys_innodb_flush_neighbor_pages, SHOW_SYS},
++ {sys_innodb_read_ahead.name, (char*) &sys_innodb_read_ahead, SHOW_SYS},
++ {sys_innodb_enable_unsafe_group_commit.name, (char*) &sys_innodb_enable_unsafe_group_commit, SHOW_SYS},
++ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
+ {sys_innodb_use_legacy_cardinality_algorithm.name,
+ (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS},
+ #endif
+@@ -1459,6 +1527,18 @@
+ }
+ }
+
++#ifdef HAVE_INNOBASE_DB
++extern void fix_innodb_read_ahead(THD *thd, enum_var_type type)
++{
++ srv_read_ahead &= 3;
++}
++
++extern void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type)
++{
++ srv_adaptive_checkpoint %= 3;
++}
++#endif /* HAVE_INNOBASE_DB */
++
+ static void fix_max_binlog_size(THD *thd, enum_var_type type)
+ {
+ DBUG_ENTER("fix_max_binlog_size");
+diff -ruN a/sql/set_var.h b/sql/set_var.h
+--- a/sql/set_var.h 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.h 2009-07-02 17:35:17.000000000 +0900
+@@ -31,6 +31,11 @@
+
+ extern TYPELIB bool_typelib, delay_key_write_typelib, sql_mode_typelib;
+
++#ifdef HAVE_INNOBASE_DB
++extern TYPELIB innodb_read_ahead_typelib;
++extern TYPELIB innodb_adaptive_checkpoint_typelib;
++#endif /* HAVE_INNOBASE_DB */
++
+ typedef int (*sys_check_func)(THD *, set_var *);
+ typedef bool (*sys_update_func)(THD *, set_var *);
+ typedef void (*sys_after_update_func)(THD *,enum_var_type);
+@@ -1148,6 +1153,10 @@
+ int sql_set_variables(THD *thd, List<set_var_base> *var_list);
+ bool not_all_support_one_shot(List<set_var_base> *var_list);
+ void fix_delay_key_write(THD *thd, enum_var_type type);
++#ifdef HAVE_INNOBASE_DB
++void fix_innodb_read_ahead(THD *thd, enum_var_type type);
++void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type);
++#endif /* HAVE_INNOBASE_DB */
+ ulong fix_sql_mode(ulong sql_mode);
+ extern sys_var_const_str sys_charset_system;
+ extern sys_var_str sys_init_connect;