block migration: do not submit multiple AIOs for same sector
Block migration can submit multiple AIO reads for the same sector/chunk, but
completion of such reads can happen out of order:
migration               guest
- get_dirty(N)
- aio_read(N)
- clear_dirty(N)
                        write(N)
                        set_dirty(N)
- get_dirty(N)
- aio_read(N)
If the first aio_read completes after the second, its stale data overwrites
the fresh data on the destination.
Fix this by never allowing more than one AIO to be in flight for the same
sector.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
			
			
This commit is contained in:
parent 4dcafbb1eb
commit 33656af702
			@ -49,12 +49,14 @@ typedef struct BlkMigDevState {
 | 
			
		|||
    int64_t total_sectors;
 | 
			
		||||
    int64_t dirty;
 | 
			
		||||
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
 | 
			
		||||
    unsigned long *aio_bitmap;
 | 
			
		||||
} BlkMigDevState;
 | 
			
		||||
 | 
			
		||||
typedef struct BlkMigBlock {
 | 
			
		||||
    uint8_t *buf;
 | 
			
		||||
    BlkMigDevState *bmds;
 | 
			
		||||
    int64_t sector;
 | 
			
		||||
    int nr_sectors;
 | 
			
		||||
    struct iovec iov;
 | 
			
		||||
    QEMUIOVector qiov;
 | 
			
		||||
    BlockDriverAIOCB *aiocb;
 | 
			
		||||
| 
						 | 
				
			
			@ -140,6 +142,57 @@ static inline long double compute_read_bwidth(void)
 | 
			
		|||
    return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Return non-zero when an asynchronous read is already in flight for the
 * dirty chunk containing @sector; return zero otherwise, including when
 * no bitmap has been allocated or @sector lies beyond the device end. */
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
    unsigned long bits_per_word = sizeof(unsigned long) * 8;

    if (!bmds->aio_bitmap ||
        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(bmds->bs)) {
        return 0;
    }
    return (bmds->aio_bitmap[chunk / bits_per_word] >>
            (chunk % bits_per_word)) & 1;
}
 | 
			
		||||
 | 
			
		||||
/* Set (@set != 0) or clear (@set == 0) the AIO-in-flight bit for every
 * dirty chunk overlapped by the sector range
 * [@sector_num, @sector_num + @nb_sectors). */
static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t chunk;
    int64_t first = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    int64_t last = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (chunk = first; chunk <= last; chunk++) {
        unsigned long idx = chunk / (sizeof(unsigned long) * 8);
        unsigned long mask = 1UL << (chunk % (sizeof(unsigned long) * 8));

        /* |= and &= ~ are idempotent, so no need to test the bit first */
        if (set) {
            bmds->aio_bitmap[idx] |= mask;
        } else {
            bmds->aio_bitmap[idx] &= ~mask;
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/* Allocate the per-device bitmap tracking which dirty chunks have an AIO
 * read in flight: one bit per BDRV_SECTORS_PER_DIRTY_CHUNK chunk.
 *
 * The bitmap is accessed in units of unsigned long by bmds_aio_inflight()
 * and bmds_set_aio_inflight(), so the allocation must be rounded up to a
 * whole number of unsigned longs.  Sizing it in bytes only (as the chunk
 * count / 8 computation yields) would let those word-sized accesses read
 * and write past the end of the buffer on small devices. */
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    /* bitmap size in bytes: one bit per chunk, rounded up */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
    /* round up to a multiple of sizeof(unsigned long) so that the
     * word-indexed bitmap accesses stay within the allocation */
    bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) &
            ~((int64_t)sizeof(unsigned long) - 1);

    bmds->aio_bitmap = qemu_mallocz(bitmap_size);
}
 | 
			
		||||
 | 
			
		||||
static void blk_mig_read_cb(void *opaque, int ret)
 | 
			
		||||
{
 | 
			
		||||
    BlkMigBlock *blk = opaque;
 | 
			
		||||
| 
						 | 
				
			
			@ -151,6 +204,7 @@ static void blk_mig_read_cb(void *opaque, int ret)
 | 
			
		|||
    add_avg_read_time(blk->time);
 | 
			
		||||
 | 
			
		||||
    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
 | 
			
		||||
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
 | 
			
		||||
 | 
			
		||||
    block_mig_state.submitted--;
 | 
			
		||||
    block_mig_state.read_done++;
 | 
			
		||||
| 
						 | 
				
			
			@ -194,6 +248,7 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
 | 
			
		|||
    blk->buf = qemu_malloc(BLOCK_SIZE);
 | 
			
		||||
    blk->bmds = bmds;
 | 
			
		||||
    blk->sector = cur_sector;
 | 
			
		||||
    blk->nr_sectors = nr_sectors;
 | 
			
		||||
 | 
			
		||||
    blk->iov.iov_base = blk->buf;
 | 
			
		||||
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
 | 
			
		||||
| 
						 | 
				
			
			@ -248,6 +303,7 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
 | 
			
		|||
        bmds->total_sectors = sectors;
 | 
			
		||||
        bmds->completed_sectors = 0;
 | 
			
		||||
        bmds->shared_base = block_mig_state.shared_base;
 | 
			
		||||
        alloc_aio_bitmap(bmds);
 | 
			
		||||
 | 
			
		||||
        block_mig_state.total_sector_sum += sectors;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -329,6 +385,8 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 | 
			
		|||
    int nr_sectors;
 | 
			
		||||
 | 
			
		||||
    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
 | 
			
		||||
        if (bmds_aio_inflight(bmds, sector))
 | 
			
		||||
            qemu_aio_flush();
 | 
			
		||||
        if (bdrv_get_dirty(bmds->bs, sector)) {
 | 
			
		||||
 | 
			
		||||
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
 | 
			
		||||
| 
						 | 
				
			
			@ -340,6 +398,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 | 
			
		|||
            blk->buf = qemu_malloc(BLOCK_SIZE);
 | 
			
		||||
            blk->bmds = bmds;
 | 
			
		||||
            blk->sector = sector;
 | 
			
		||||
            blk->nr_sectors = nr_sectors;
 | 
			
		||||
 | 
			
		||||
            if (is_async) {
 | 
			
		||||
                blk->iov.iov_base = blk->buf;
 | 
			
		||||
| 
						 | 
				
			
			@ -354,6 +413,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 | 
			
		|||
                    goto error;
 | 
			
		||||
                }
 | 
			
		||||
                block_mig_state.submitted++;
 | 
			
		||||
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
 | 
			
		||||
            } else {
 | 
			
		||||
                if (bdrv_read(bmds->bs, sector, blk->buf,
 | 
			
		||||
                              nr_sectors) < 0) {
 | 
			
		||||
| 
						 | 
				
			
			@ -474,6 +534,7 @@ static void blk_mig_cleanup(Monitor *mon)
 | 
			
		|||
 | 
			
		||||
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
 | 
			
		||||
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
 | 
			
		||||
        qemu_free(bmds->aio_bitmap);
 | 
			
		||||
        qemu_free(bmds);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue