1694 lines
		
	
	
		
			54 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1694 lines
		
	
	
		
			54 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0-only
 | |
| /*
 | |
|  * Copyright 2023 Red Hat
 | |
|  */
 | |
| 
 | |
| #include "volume.h"
 | |
| 
 | |
| #include <linux/atomic.h>
 | |
| #include <linux/dm-bufio.h>
 | |
| #include <linux/err.h>
 | |
| 
 | |
| #include "errors.h"
 | |
| #include "logger.h"
 | |
| #include "memory-alloc.h"
 | |
| #include "permassert.h"
 | |
| #include "string-utils.h"
 | |
| #include "thread-utils.h"
 | |
| 
 | |
| #include "chapter-index.h"
 | |
| #include "config.h"
 | |
| #include "geometry.h"
 | |
| #include "hash-utils.h"
 | |
| #include "index.h"
 | |
| #include "sparse-cache.h"
 | |
| 
 | |
| /*
 | |
|  * The first block of the volume layout is reserved for the volume header, which is no longer used.
 | |
|  * The remainder of the volume is divided into chapters consisting of several pages of records, and
 | |
|  * several pages of static index to use to find those records. The index pages are recorded first,
 | |
|  * followed by the record pages. The chapters are written in order as they are filled, so the
 | |
|  * volume storage acts as a circular log of the most recent chapters, with each new chapter
 | |
|  * overwriting the oldest saved one.
 | |
|  *
 | |
|  * When a new chapter is filled and closed, the records from that chapter are sorted and
 | |
|  * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
 | |
|  * index is generated to store which record page contains each record. The in-memory index page map
 | |
|  * is also updated to indicate which delta lists fall on each chapter index page. This means that
 | |
|  * when a record is read, the volume only has to load a single index page and a single record page,
 | |
|  * rather than search the entire chapter. These index and record pages are written to storage, and
 | |
|  * the index pages are transferred to the page cache under the theory that the most recently
 | |
|  * written chapter is likely to be accessed again soon.
 | |
|  *
 | |
|  * When reading a record, the volume index will indicate which chapter should contain it. The
 | |
|  * volume uses the index page map to determine which chapter index page needs to be loaded, and
 | |
|  * then reads the relevant record page number from the chapter index. Both index and record pages
 | |
|  * are stored in a page cache when read for the common case that subsequent records need the same
 | |
|  * pages. The page cache evicts the least recently accessed entries when caching new pages. In
 | |
|  * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
 | |
|  * additional caching depending on available system resources.
 | |
|  *
 | |
|  * Record requests are handled from cached pages when possible. If a page needs to be read, it is
 | |
|  * placed on a queue along with the request that wants to read it. Any requests for the same page
 | |
|  * that arrive while the read is pending are added to the queue entry. A separate reader thread
 | |
|  * handles the queued reads, adding the page to the cache and updating any requests queued with it
 | |
|  * so they can continue processing. This allows the index zone threads to continue processing new
 | |
|  * requests rather than wait for the storage reads.
 | |
|  *
 | |
|  * When an index rebuild is necessary, the volume reads each stored chapter to determine which
 | |
|  * range of chapters contain valid records, so that those records can be used to reconstruct the
 | |
|  * in-memory volume index.
 | |
|  */
 | |
| 
 | |
| /* The maximum allowable number of contiguous bad chapters */
 | |
| #define MAX_BAD_CHAPTERS 100
 | |
| #define VOLUME_CACHE_MAX_ENTRIES (U16_MAX >> 1)
 | |
| #define VOLUME_CACHE_QUEUED_FLAG (1 << 15)
 | |
| #define VOLUME_CACHE_MAX_QUEUED_READS 4096
 | |
| 
 | |
| static const u64 BAD_CHAPTER = U64_MAX;
 | |
| 
 | |
| /*
 | |
|  * The invalidate counter is two 32 bits fields stored together atomically. The low order 32 bits
 | |
|  * are the physical page number of the cached page being read. The high order 32 bits are a
 | |
|  * sequence number. This value is written when the zone that owns it begins or completes a cache
 | |
|  * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
 | |
|  * to update the cache contents.
 | |
|  */
 | |
| union invalidate_counter {
 | |
| 	u64 value;
 | |
| 	struct {
 | |
| 		u32 page;
 | |
| 		u32 counter;
 | |
| 	};
 | |
| };
 | |
| 
 | |
| static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page)
 | |
| {
 | |
| 	return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter;
 | |
| }
 | |
| 
 | |
| static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page)
 | |
| {
 | |
| 	return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter;
 | |
| }
 | |
| 
 | |
| static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page)
 | |
| {
 | |
| 	return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter;
 | |
| }
 | |
| 
 | |
| static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page)
 | |
| {
 | |
| 	/* Page zero is the header page, so the first chapter index page is page one. */
 | |
| 	return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page;
 | |
| }
 | |
| 
 | |
| static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache,
 | |
| 							      unsigned int zone_number)
 | |
| {
 | |
| 	return (union invalidate_counter) {
 | |
| 		.value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value),
 | |
| 	};
 | |
| }
 | |
| 
 | |
| static inline void set_invalidate_counter(struct page_cache *cache,
 | |
| 					  unsigned int zone_number,
 | |
| 					  union invalidate_counter invalidate_counter)
 | |
| {
 | |
| 	WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value,
 | |
| 		   invalidate_counter.value);
 | |
| }
 | |
| 
 | |
| static inline bool search_pending(union invalidate_counter invalidate_counter)
 | |
| {
 | |
| 	return (invalidate_counter.counter & 1) != 0;
 | |
| }
 | |
| 
 | |
| /* Lock the cache for a zone in order to search for a page. */
 | |
| static void begin_pending_search(struct page_cache *cache, u32 physical_page,
 | |
| 				 unsigned int zone_number)
 | |
| {
 | |
| 	union invalidate_counter invalidate_counter =
 | |
| 		get_invalidate_counter(cache, zone_number);
 | |
| 
 | |
| 	invalidate_counter.page = physical_page;
 | |
| 	invalidate_counter.counter++;
 | |
| 	set_invalidate_counter(cache, zone_number, invalidate_counter);
 | |
| 	VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter),
 | |
| 			    "Search is pending for zone %u", zone_number);
 | |
| 	/*
 | |
| 	 * This memory barrier ensures that the write to the invalidate counter is seen by other
 | |
| 	 * threads before this thread accesses the cached page. The corresponding read memory
 | |
| 	 * barrier is in wait_for_pending_searches().
 | |
| 	 */
 | |
| 	smp_mb();
 | |
| }
 | |
| 
 | |
| /* Unlock the cache for a zone by clearing its invalidate counter. */
 | |
| static void end_pending_search(struct page_cache *cache, unsigned int zone_number)
 | |
| {
 | |
| 	union invalidate_counter invalidate_counter;
 | |
| 
 | |
| 	/*
 | |
| 	 * This memory barrier ensures that this thread completes reads of the
 | |
| 	 * cached page before other threads see the write to the invalidate
 | |
| 	 * counter.
 | |
| 	 */
 | |
| 	smp_mb();
 | |
| 
 | |
| 	invalidate_counter = get_invalidate_counter(cache, zone_number);
 | |
| 	VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter),
 | |
| 			    "Search is pending for zone %u", zone_number);
 | |
| 	invalidate_counter.counter++;
 | |
| 	set_invalidate_counter(cache, zone_number, invalidate_counter);
 | |
| }
 | |
| 
 | |
| static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page)
 | |
| {
 | |
| 	union invalidate_counter initial_counters[MAX_ZONES];
 | |
| 	unsigned int i;
 | |
| 
 | |
| 	/*
 | |
| 	 * We hold the read_threads_mutex. We are waiting for threads that do not hold the
 | |
| 	 * read_threads_mutex. Those threads have "locked" their targeted page by setting the
 | |
| 	 * search_pending_counter. The corresponding write memory barrier is in
 | |
| 	 * begin_pending_search().
 | |
| 	 */
 | |
| 	smp_mb();
 | |
| 
 | |
| 	for (i = 0; i < cache->zone_count; i++)
 | |
| 		initial_counters[i] = get_invalidate_counter(cache, i);
 | |
| 	for (i = 0; i < cache->zone_count; i++) {
 | |
| 		if (search_pending(initial_counters[i]) &&
 | |
| 		    (initial_counters[i].page == physical_page)) {
 | |
| 			/*
 | |
| 			 * There is an active search using the physical page. We need to wait for
 | |
| 			 * the search to finish.
 | |
| 			 *
 | |
| 			 * FIXME: Investigate using wait_event() to wait for the search to finish.
 | |
| 			 */
 | |
| 			while (initial_counters[i].value ==
 | |
| 			       get_invalidate_counter(cache, i).value)
 | |
| 				cond_resched();
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void release_page_buffer(struct cached_page *page)
 | |
| {
 | |
| 	if (page->buffer != NULL)
 | |
| 		dm_bufio_release(vdo_forget(page->buffer));
 | |
| }
 | |
| 
 | |
| static void clear_cache_page(struct page_cache *cache, struct cached_page *page)
 | |
| {
 | |
| 	/* Do not clear read_pending because the read queue relies on it. */
 | |
| 	release_page_buffer(page);
 | |
| 	page->physical_page = cache->indexable_pages;
 | |
| 	WRITE_ONCE(page->last_used, 0);
 | |
| }
 | |
| 
 | |
| static void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
 | |
| {
 | |
| 	/*
 | |
| 	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
 | |
| 	 * thread holding the read_threads_mutex.
 | |
| 	 */
 | |
| 	if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used))
 | |
| 		WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock));
 | |
| }
 | |
| 
 | |
| /* Select a page to remove from the cache to make space for a new entry. */
 | |
| static struct cached_page *select_victim_in_cache(struct page_cache *cache)
 | |
| {
 | |
| 	struct cached_page *page;
 | |
| 	int oldest_index = 0;
 | |
| 	s64 oldest_time = S64_MAX;
 | |
| 	s64 last_used;
 | |
| 	u16 i;
 | |
| 
 | |
| 	/* Find the oldest unclaimed page. We hold the read_threads_mutex. */
 | |
| 	for (i = 0; i < cache->cache_slots; i++) {
 | |
| 		/* A page with a pending read must not be replaced. */
 | |
| 		if (cache->cache[i].read_pending)
 | |
| 			continue;
 | |
| 
 | |
| 		last_used = READ_ONCE(cache->cache[i].last_used);
 | |
| 		if (last_used <= oldest_time) {
 | |
| 			oldest_time = last_used;
 | |
| 			oldest_index = i;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	page = &cache->cache[oldest_index];
 | |
| 	if (page->physical_page != cache->indexable_pages) {
 | |
| 		WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
 | |
| 		wait_for_pending_searches(cache, page->physical_page);
 | |
| 	}
 | |
| 
 | |
| 	page->read_pending = true;
 | |
| 	clear_cache_page(cache, page);
 | |
| 	return page;
 | |
| }
 | |
| 
 | |
| /* Make a newly filled cache entry available to other threads. */
 | |
| static int put_page_in_cache(struct page_cache *cache, u32 physical_page,
 | |
| 			     struct cached_page *page)
 | |
| {
 | |
| 	int result;
 | |
| 
 | |
| 	/* We hold the read_threads_mutex. */
 | |
| 	result = VDO_ASSERT((page->read_pending), "page to install has a pending read");
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	page->physical_page = physical_page;
 | |
| 	make_page_most_recent(cache, page);
 | |
| 	page->read_pending = false;
 | |
| 
 | |
| 	/*
 | |
| 	 * We hold the read_threads_mutex, but we must have a write memory barrier before making
 | |
| 	 * the cached_page available to the readers that do not hold the mutex. The corresponding
 | |
| 	 * read memory barrier is in get_page_and_index().
 | |
| 	 */
 | |
| 	smp_wmb();
 | |
| 
 | |
| 	/* This assignment also clears the queued flag. */
 | |
| 	WRITE_ONCE(cache->index[physical_page], page - cache->cache);
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page,
 | |
| 				 struct cached_page *page)
 | |
| {
 | |
| 	int result;
 | |
| 
 | |
| 	/* We hold the read_threads_mutex. */
 | |
| 	result = VDO_ASSERT((page->read_pending), "page to install has a pending read");
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return;
 | |
| 
 | |
| 	clear_cache_page(cache, page);
 | |
| 	page->read_pending = false;
 | |
| 
 | |
| 	/* Clear the mapping and the queued flag for the new page. */
 | |
| 	WRITE_ONCE(cache->index[physical_page], cache->cache_slots);
 | |
| }
 | |
| 
 | |
| static inline u16 next_queue_position(u16 position)
 | |
| {
 | |
| 	return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS;
 | |
| }
 | |
| 
 | |
| static inline void advance_queue_position(u16 *position)
 | |
| {
 | |
| 	*position = next_queue_position(*position);
 | |
| }
 | |
| 
 | |
| static inline bool read_queue_is_full(struct page_cache *cache)
 | |
| {
 | |
| 	return cache->read_queue_first == next_queue_position(cache->read_queue_last);
 | |
| }
 | |
| 
 | |
| static bool enqueue_read(struct page_cache *cache, struct uds_request *request,
 | |
| 			 u32 physical_page)
 | |
| {
 | |
| 	struct queued_read *queue_entry;
 | |
| 	u16 last = cache->read_queue_last;
 | |
| 	u16 read_queue_index;
 | |
| 
 | |
| 	/* We hold the read_threads_mutex. */
 | |
| 	if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) {
 | |
| 		/* This page has no existing entry in the queue. */
 | |
| 		if (read_queue_is_full(cache))
 | |
| 			return false;
 | |
| 
 | |
| 		/* Fill in the read queue entry. */
 | |
| 		cache->read_queue[last].physical_page = physical_page;
 | |
| 		cache->read_queue[last].invalid = false;
 | |
| 		cache->read_queue[last].first_request = NULL;
 | |
| 		cache->read_queue[last].last_request = NULL;
 | |
| 
 | |
| 		/* Point the cache index to the read queue entry. */
 | |
| 		read_queue_index = last;
 | |
| 		WRITE_ONCE(cache->index[physical_page],
 | |
| 			   read_queue_index | VOLUME_CACHE_QUEUED_FLAG);
 | |
| 
 | |
| 		advance_queue_position(&cache->read_queue_last);
 | |
| 	} else {
 | |
| 		/* It's already queued, so add this request to the existing entry. */
 | |
| 		read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG;
 | |
| 	}
 | |
| 
 | |
| 	request->next_request = NULL;
 | |
| 	queue_entry = &cache->read_queue[read_queue_index];
 | |
| 	if (queue_entry->first_request == NULL)
 | |
| 		queue_entry->first_request = request;
 | |
| 	else
 | |
| 		queue_entry->last_request->next_request = request;
 | |
| 	queue_entry->last_request = request;
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| static void enqueue_page_read(struct volume *volume, struct uds_request *request,
 | |
| 			      u32 physical_page)
 | |
| {
 | |
| 	/* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
 | |
| 	while (!enqueue_read(&volume->page_cache, request, physical_page)) {
 | |
| 		vdo_log_debug("Read queue full, waiting for reads to finish");
 | |
| 		uds_wait_cond(&volume->read_threads_read_done_cond,
 | |
| 			      &volume->read_threads_mutex);
 | |
| 	}
 | |
| 
 | |
| 	uds_signal_cond(&volume->read_threads_cond);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Reserve the next read queue entry for processing, but do not actually remove it from the queue.
 | |
|  * Must be followed by release_queued_requests().
 | |
|  */
 | |
| static struct queued_read *reserve_read_queue_entry(struct page_cache *cache)
 | |
| {
 | |
| 	/* We hold the read_threads_mutex. */
 | |
| 	struct queued_read *entry;
 | |
| 	u16 index_value;
 | |
| 	bool queued;
 | |
| 
 | |
| 	/* No items to dequeue */
 | |
| 	if (cache->read_queue_next_read == cache->read_queue_last)
 | |
| 		return NULL;
 | |
| 
 | |
| 	entry = &cache->read_queue[cache->read_queue_next_read];
 | |
| 	index_value = cache->index[entry->physical_page];
 | |
| 	queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
 | |
| 	/* Check to see if it's still queued before resetting. */
 | |
| 	if (entry->invalid && queued)
 | |
| 		WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots);
 | |
| 
 | |
| 	/*
 | |
| 	 * If a synchronous read has taken this page, set invalid to true so it doesn't get
 | |
| 	 * overwritten. Requests will just be requeued.
 | |
| 	 */
 | |
| 	if (!queued)
 | |
| 		entry->invalid = true;
 | |
| 
 | |
| 	entry->reserved = true;
 | |
| 	advance_queue_position(&cache->read_queue_next_read);
 | |
| 	return entry;
 | |
| }
 | |
| 
 | |
| static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume)
 | |
| {
 | |
| 	struct queued_read *queue_entry = NULL;
 | |
| 
 | |
| 	while (!volume->read_threads_exiting) {
 | |
| 		queue_entry = reserve_read_queue_entry(&volume->page_cache);
 | |
| 		if (queue_entry != NULL)
 | |
| 			break;
 | |
| 
 | |
| 		uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex);
 | |
| 	}
 | |
| 
 | |
| 	return queue_entry;
 | |
| }
 | |
| 
 | |
| static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
 | |
| 				   u32 chapter, u32 index_page_number,
 | |
| 				   struct delta_index_page *chapter_index_page)
 | |
| {
 | |
| 	u64 ci_virtual;
 | |
| 	u32 ci_chapter;
 | |
| 	u32 lowest_list;
 | |
| 	u32 highest_list;
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	int result;
 | |
| 
 | |
| 	result = uds_initialize_chapter_index_page(chapter_index_page, geometry,
 | |
| 						   index_page, volume->nonce);
 | |
| 	if (volume->lookup_mode == LOOKUP_FOR_REBUILD)
 | |
| 		return result;
 | |
| 
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		return vdo_log_error_strerror(result,
 | |
| 					      "Reading chapter index page for chapter %u page %u",
 | |
| 					      chapter, index_page_number);
 | |
| 	}
 | |
| 
 | |
| 	uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number,
 | |
| 				   &lowest_list, &highest_list);
 | |
| 	ci_virtual = chapter_index_page->virtual_chapter_number;
 | |
| 	ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual);
 | |
| 	if ((chapter == ci_chapter) &&
 | |
| 	    (lowest_list == chapter_index_page->lowest_list_number) &&
 | |
| 	    (highest_list == chapter_index_page->highest_list_number))
 | |
| 		return UDS_SUCCESS;
 | |
| 
 | |
| 	vdo_log_warning("Index page map updated to %llu",
 | |
| 			(unsigned long long) volume->index_page_map->last_update);
 | |
| 	vdo_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
 | |
| 			chapter, index_page_number, lowest_list, highest_list,
 | |
| 			(unsigned long long) ci_virtual,
 | |
| 			chapter_index_page->lowest_list_number,
 | |
| 			chapter_index_page->highest_list_number);
 | |
| 	return vdo_log_error_strerror(UDS_CORRUPT_DATA,
 | |
| 				      "index page map mismatch with chapter index");
 | |
| }
 | |
| 
 | |
| static int initialize_index_page(const struct volume *volume, u32 physical_page,
 | |
| 				 struct cached_page *page)
 | |
| {
 | |
| 	u32 chapter = map_to_chapter_number(volume->geometry, physical_page);
 | |
| 	u32 index_page_number = map_to_page_number(volume->geometry, physical_page);
 | |
| 
 | |
| 	return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer),
 | |
| 				       chapter, index_page_number, &page->index_page);
 | |
| }
 | |
| 
 | |
| static bool search_record_page(const u8 record_page[],
 | |
| 			       const struct uds_record_name *name,
 | |
| 			       const struct index_geometry *geometry,
 | |
| 			       struct uds_record_data *metadata)
 | |
| {
 | |
| 	/*
 | |
| 	 * The array of records is sorted by name and stored as a binary tree in heap order, so the
 | |
| 	 * root of the tree is the first array element.
 | |
| 	 */
 | |
| 	u32 node = 0;
 | |
| 	const struct uds_volume_record *records = (const struct uds_volume_record *) record_page;
 | |
| 
 | |
| 	while (node < geometry->records_per_page) {
 | |
| 		int result;
 | |
| 		const struct uds_volume_record *record = &records[node];
 | |
| 
 | |
| 		result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE);
 | |
| 		if (result == 0) {
 | |
| 			if (metadata != NULL)
 | |
| 				*metadata = record->data;
 | |
| 			return true;
 | |
| 		}
 | |
| 
 | |
| 		/* The children of node N are at indexes 2N+1 and 2N+2. */
 | |
| 		node = ((2 * node) + ((result < 0) ? 1 : 2));
 | |
| 	}
 | |
| 
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * If we've read in a record page, we're going to do an immediate search, to speed up processing by
 | |
|  * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If
 | |
|  * we've read in an index page, we save the record page number so we don't have to resolve the
 | |
|  * index page again. We use the location, virtual_chapter, and old_metadata fields in the request
 | |
|  * to allow the index code to know where to begin processing the request again.
 | |
|  */
 | |
| static int search_page(struct cached_page *page, const struct volume *volume,
 | |
| 		       struct uds_request *request, u32 physical_page)
 | |
| {
 | |
| 	int result;
 | |
| 	enum uds_index_region location;
 | |
| 	u16 record_page_number;
 | |
| 
 | |
| 	if (is_record_page(volume->geometry, physical_page)) {
 | |
| 		if (search_record_page(dm_bufio_get_block_data(page->buffer),
 | |
| 				       &request->record_name, volume->geometry,
 | |
| 				       &request->old_metadata))
 | |
| 			location = UDS_LOCATION_RECORD_PAGE_LOOKUP;
 | |
| 		else
 | |
| 			location = UDS_LOCATION_UNAVAILABLE;
 | |
| 	} else {
 | |
| 		result = uds_search_chapter_index_page(&page->index_page,
 | |
| 						       volume->geometry,
 | |
| 						       &request->record_name,
 | |
| 						       &record_page_number);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return result;
 | |
| 
 | |
| 		if (record_page_number == NO_CHAPTER_INDEX_ENTRY) {
 | |
| 			location = UDS_LOCATION_UNAVAILABLE;
 | |
| 		} else {
 | |
| 			location = UDS_LOCATION_INDEX_PAGE_LOOKUP;
 | |
| 			*((u16 *) &request->old_metadata) = record_page_number;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	request->location = location;
 | |
| 	request->found = false;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static int process_entry(struct volume *volume, struct queued_read *entry)
 | |
| {
 | |
| 	u32 page_number = entry->physical_page;
 | |
| 	struct uds_request *request;
 | |
| 	struct cached_page *page = NULL;
 | |
| 	u8 *page_data;
 | |
| 	int result;
 | |
| 
 | |
| 	if (entry->invalid) {
 | |
| 		vdo_log_debug("Requeuing requests for invalid page");
 | |
| 		return UDS_SUCCESS;
 | |
| 	}
 | |
| 
 | |
| 	page = select_victim_in_cache(&volume->page_cache);
 | |
| 
 | |
| 	mutex_unlock(&volume->read_threads_mutex);
 | |
| 	page_data = dm_bufio_read(volume->client, page_number, &page->buffer);
 | |
| 	mutex_lock(&volume->read_threads_mutex);
 | |
| 	if (IS_ERR(page_data)) {
 | |
| 		result = -PTR_ERR(page_data);
 | |
| 		vdo_log_warning_strerror(result,
 | |
| 					 "error reading physical page %u from volume",
 | |
| 					 page_number);
 | |
| 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	if (entry->invalid) {
 | |
| 		vdo_log_warning("Page %u invalidated after read", page_number);
 | |
| 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 | |
| 		return UDS_SUCCESS;
 | |
| 	}
 | |
| 
 | |
| 	if (!is_record_page(volume->geometry, page_number)) {
 | |
| 		result = initialize_index_page(volume, page_number, page);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			vdo_log_warning("Error initializing chapter index page");
 | |
| 			cancel_page_in_cache(&volume->page_cache, page_number, page);
 | |
| 			return result;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	result = put_page_in_cache(&volume->page_cache, page_number, page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		vdo_log_warning("Error putting page %u in cache", page_number);
 | |
| 		cancel_page_in_cache(&volume->page_cache, page_number, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	request = entry->first_request;
 | |
| 	while ((request != NULL) && (result == UDS_SUCCESS)) {
 | |
| 		result = search_page(page, volume, request, page_number);
 | |
| 		request = request->next_request;
 | |
| 	}
 | |
| 
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| static void release_queued_requests(struct volume *volume, struct queued_read *entry,
 | |
| 				    int result)
 | |
| {
 | |
| 	struct page_cache *cache = &volume->page_cache;
 | |
| 	u16 next_read = cache->read_queue_next_read;
 | |
| 	struct uds_request *request;
 | |
| 	struct uds_request *next;
 | |
| 
 | |
| 	for (request = entry->first_request; request != NULL; request = next) {
 | |
| 		next = request->next_request;
 | |
| 		request->status = result;
 | |
| 		request->requeued = true;
 | |
| 		uds_enqueue_request(request, STAGE_INDEX);
 | |
| 	}
 | |
| 
 | |
| 	entry->reserved = false;
 | |
| 
 | |
| 	/* Move the read_queue_first pointer as far as we can. */
 | |
| 	while ((cache->read_queue_first != next_read) &&
 | |
| 	       (!cache->read_queue[cache->read_queue_first].reserved))
 | |
| 		advance_queue_position(&cache->read_queue_first);
 | |
| 	uds_broadcast_cond(&volume->read_threads_read_done_cond);
 | |
| }
 | |
| 
 | |
| static void read_thread_function(void *arg)
 | |
| {
 | |
| 	struct volume *volume = arg;
 | |
| 
 | |
| 	vdo_log_debug("reader starting");
 | |
| 	mutex_lock(&volume->read_threads_mutex);
 | |
| 	while (true) {
 | |
| 		struct queued_read *queue_entry;
 | |
| 		int result;
 | |
| 
 | |
| 		queue_entry = wait_to_reserve_read_queue_entry(volume);
 | |
| 		if (volume->read_threads_exiting)
 | |
| 			break;
 | |
| 
 | |
| 		result = process_entry(volume, queue_entry);
 | |
| 		release_queued_requests(volume, queue_entry, result);
 | |
| 	}
 | |
| 	mutex_unlock(&volume->read_threads_mutex);
 | |
| 	vdo_log_debug("reader done");
 | |
| }
 | |
| 
 | |
| static void get_page_and_index(struct page_cache *cache, u32 physical_page,
 | |
| 			       int *queue_index, struct cached_page **page_ptr)
 | |
| {
 | |
| 	u16 index_value;
 | |
| 	u16 index;
 | |
| 	bool queued;
 | |
| 
 | |
| 	/*
 | |
| 	 * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
 | |
| 	 * thread holding the read_threads_mutex.
 | |
| 	 *
 | |
| 	 * Holding only a search_pending_counter is the most frequent case.
 | |
| 	 */
 | |
| 	/*
 | |
| 	 * It would be unlikely for the compiler to turn the usage of index_value into two reads of
 | |
| 	 * cache->index, but it would be possible and very bad if those reads did not return the
 | |
| 	 * same bits.
 | |
| 	 */
 | |
| 	index_value = READ_ONCE(cache->index[physical_page]);
 | |
| 	queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
 | |
| 	index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
 | |
| 
 | |
| 	if (!queued && (index < cache->cache_slots)) {
 | |
| 		*page_ptr = &cache->cache[index];
 | |
| 		/*
 | |
| 		 * We have acquired access to the cached page, but unless we hold the
 | |
| 		 * read_threads_mutex, we need a read memory barrier now. The corresponding write
 | |
| 		 * memory barrier is in put_page_in_cache().
 | |
| 		 */
 | |
| 		smp_rmb();
 | |
| 	} else {
 | |
| 		*page_ptr = NULL;
 | |
| 	}
 | |
| 
 | |
| 	*queue_index = queued ? index : -1;
 | |
| }
 | |
| 
 | |
| static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
 | |
| 				struct cached_page **page)
 | |
| {
 | |
| 	/*
 | |
| 	 * ASSERTION: We are in a zone thread.
 | |
| 	 * ASSERTION: We holding a search_pending_counter or the read_threads_mutex.
 | |
| 	 */
 | |
| 	int queue_index = -1;
 | |
| 
 | |
| 	get_page_and_index(cache, physical_page, &queue_index, page);
 | |
| }
 | |
| 
 | |
| static int read_page_locked(struct volume *volume, u32 physical_page,
 | |
| 			    struct cached_page **page_ptr)
 | |
| {
 | |
| 	int result = UDS_SUCCESS;
 | |
| 	struct cached_page *page = NULL;
 | |
| 	u8 *page_data;
 | |
| 
 | |
| 	page = select_victim_in_cache(&volume->page_cache);
 | |
| 	page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
 | |
| 	if (IS_ERR(page_data)) {
 | |
| 		result = -PTR_ERR(page_data);
 | |
| 		vdo_log_warning_strerror(result,
 | |
| 					 "error reading physical page %u from volume",
 | |
| 					 physical_page);
 | |
| 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	if (!is_record_page(volume->geometry, physical_page)) {
 | |
| 		result = initialize_index_page(volume, physical_page, page);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
 | |
| 				vdo_log_warning("Corrupt index page %u", physical_page);
 | |
| 			cancel_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 			return result;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	result = put_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		vdo_log_warning("Error putting page %u in cache", physical_page);
 | |
| 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	*page_ptr = page;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| /* Retrieve a page from the cache while holding the read threads mutex. */
 | |
| static int get_volume_page_locked(struct volume *volume, u32 physical_page,
 | |
| 				  struct cached_page **page_ptr)
 | |
| {
 | |
| 	int result;
 | |
| 	struct cached_page *page = NULL;
 | |
| 
 | |
| 	get_page_from_cache(&volume->page_cache, physical_page, &page);
 | |
| 	if (page == NULL) {
 | |
| 		result = read_page_locked(volume, physical_page, &page);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return result;
 | |
| 	} else {
 | |
| 		make_page_most_recent(&volume->page_cache, page);
 | |
| 	}
 | |
| 
 | |
| 	*page_ptr = page;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| /* Retrieve a page from the cache while holding a search_pending lock. */
 | |
| static int get_volume_page_protected(struct volume *volume, struct uds_request *request,
 | |
| 				     u32 physical_page, struct cached_page **page_ptr)
 | |
| {
 | |
| 	struct cached_page *page;
 | |
| 
 | |
| 	get_page_from_cache(&volume->page_cache, physical_page, &page);
 | |
| 	if (page != NULL) {
 | |
| 		if (request->zone_number == 0) {
 | |
| 			/* Only one zone is allowed to update the LRU. */
 | |
| 			make_page_most_recent(&volume->page_cache, page);
 | |
| 		}
 | |
| 
 | |
| 		*page_ptr = page;
 | |
| 		return UDS_SUCCESS;
 | |
| 	}
 | |
| 
 | |
| 	/* Prepare to enqueue a read for the page. */
 | |
| 	end_pending_search(&volume->page_cache, request->zone_number);
 | |
| 	mutex_lock(&volume->read_threads_mutex);
 | |
| 
 | |
| 	/*
 | |
| 	 * Do the lookup again while holding the read mutex (no longer the fast case so this should
 | |
| 	 * be fine to repeat). We need to do this because a page may have been added to the cache
 | |
| 	 * by a reader thread between the time we searched above and the time we went to actually
 | |
| 	 * try to enqueue it below. This could result in us enqueuing another read for a page which
 | |
| 	 * is already in the cache, which would mean we end up with two entries in the cache for
 | |
| 	 * the same page.
 | |
| 	 */
 | |
| 	get_page_from_cache(&volume->page_cache, physical_page, &page);
 | |
| 	if (page == NULL) {
 | |
| 		enqueue_page_read(volume, request, physical_page);
 | |
| 		/*
 | |
| 		 * The performance gain from unlocking first, while "search pending" mode is off,
 | |
| 		 * turns out to be significant in some cases. The page is not available yet so
 | |
| 		 * the order does not matter for correctness as it does below.
 | |
| 		 */
 | |
| 		mutex_unlock(&volume->read_threads_mutex);
 | |
| 		begin_pending_search(&volume->page_cache, physical_page,
 | |
| 				     request->zone_number);
 | |
| 		return UDS_QUEUED;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and
 | |
| 	 * "search pending" state in careful order so no other thread can mess with the data before
 | |
| 	 * the caller gets to look at it.
 | |
| 	 */
 | |
| 	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
 | |
| 	mutex_unlock(&volume->read_threads_mutex);
 | |
| 	*page_ptr = page;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number,
 | |
| 			   struct cached_page **page_ptr)
 | |
| {
 | |
| 	int result;
 | |
| 	u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
 | |
| 
 | |
| 	mutex_lock(&volume->read_threads_mutex);
 | |
| 	result = get_volume_page_locked(volume, physical_page, page_ptr);
 | |
| 	mutex_unlock(&volume->read_threads_mutex);
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number,
 | |
| 			       u8 **data_ptr)
 | |
| {
 | |
| 	int result;
 | |
| 	struct cached_page *page = NULL;
 | |
| 
 | |
| 	result = get_volume_page(volume, chapter, page_number, &page);
 | |
| 	if (result == UDS_SUCCESS)
 | |
| 		*data_ptr = dm_bufio_get_block_data(page->buffer);
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number,
 | |
| 			      struct delta_index_page **index_page_ptr)
 | |
| {
 | |
| 	int result;
 | |
| 	struct cached_page *page = NULL;
 | |
| 
 | |
| 	result = get_volume_page(volume, chapter, page_number, &page);
 | |
| 	if (result == UDS_SUCCESS)
 | |
| 		*index_page_ptr = &page->index_page;
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Find the record page associated with a name in a given index page. This will return UDS_QUEUED
 | |
|  * if the page in question must be read from storage.
 | |
|  */
 | |
| static int search_cached_index_page(struct volume *volume, struct uds_request *request,
 | |
| 				    u32 chapter, u32 index_page_number,
 | |
| 				    u16 *record_page_number)
 | |
| {
 | |
| 	int result;
 | |
| 	struct cached_page *page = NULL;
 | |
| 	u32 physical_page = map_to_physical_page(volume->geometry, chapter,
 | |
| 						 index_page_number);
 | |
| 
 | |
| 	/*
 | |
| 	 * Make sure the invalidate counter is updated before we try and read the mapping. This
 | |
| 	 * prevents this thread from reading a page in the cache which has already been marked for
 | |
| 	 * invalidation by the reader thread, before the reader thread has noticed that the
 | |
| 	 * invalidate_counter has been incremented.
 | |
| 	 */
 | |
| 	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
 | |
| 
 | |
| 	result = get_volume_page_protected(volume, request, physical_page, &page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		end_pending_search(&volume->page_cache, request->zone_number);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
 | |
| 					       &request->record_name,
 | |
| 					       record_page_number);
 | |
| 	end_pending_search(&volume->page_cache, request->zone_number);
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if
 | |
|  * the page in question must be read from storage.
 | |
|  */
 | |
| int uds_search_cached_record_page(struct volume *volume, struct uds_request *request,
 | |
| 				  u32 chapter, u16 record_page_number, bool *found)
 | |
| {
 | |
| 	struct cached_page *record_page;
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	int result;
 | |
| 	u32 physical_page, page_number;
 | |
| 
 | |
| 	*found = false;
 | |
| 	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
 | |
| 		return UDS_SUCCESS;
 | |
| 
 | |
| 	result = VDO_ASSERT(record_page_number < geometry->record_pages_per_chapter,
 | |
| 			    "0 <= %d < %u", record_page_number,
 | |
| 			    geometry->record_pages_per_chapter);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	page_number = geometry->index_pages_per_chapter + record_page_number;
 | |
| 
 | |
| 	physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
 | |
| 
 | |
| 	/*
 | |
| 	 * Make sure the invalidate counter is updated before we try and read the mapping. This
 | |
| 	 * prevents this thread from reading a page in the cache which has already been marked for
 | |
| 	 * invalidation by the reader thread, before the reader thread has noticed that the
 | |
| 	 * invalidate_counter has been incremented.
 | |
| 	 */
 | |
| 	begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
 | |
| 
 | |
| 	result = get_volume_page_protected(volume, request, physical_page, &record_page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		end_pending_search(&volume->page_cache, request->zone_number);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	if (search_record_page(dm_bufio_get_block_data(record_page->buffer),
 | |
| 			       &request->record_name, geometry, &request->old_metadata))
 | |
| 		*found = true;
 | |
| 
 | |
| 	end_pending_search(&volume->page_cache, request->zone_number);
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter)
 | |
| {
 | |
| 	const struct index_geometry *geometry = volume->geometry;
 | |
| 	u32 physical_page = map_to_physical_page(geometry, chapter, 0);
 | |
| 
 | |
| 	dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter);
 | |
| }
 | |
| 
 | |
| int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter,
 | |
| 				       struct dm_buffer *volume_buffers[],
 | |
| 				       struct delta_index_page index_pages[])
 | |
| {
 | |
| 	int result;
 | |
| 	u32 i;
 | |
| 	const struct index_geometry *geometry = volume->geometry;
 | |
| 	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
 | |
| 	u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0);
 | |
| 
 | |
| 	dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter);
 | |
| 	for (i = 0; i < geometry->index_pages_per_chapter; i++) {
 | |
| 		u8 *index_page;
 | |
| 
 | |
| 		index_page = dm_bufio_read(volume->client, physical_page + i,
 | |
| 					   &volume_buffers[i]);
 | |
| 		if (IS_ERR(index_page)) {
 | |
| 			result = -PTR_ERR(index_page);
 | |
| 			vdo_log_warning_strerror(result,
 | |
| 						 "error reading physical page %u",
 | |
| 						 physical_page);
 | |
| 			return result;
 | |
| 		}
 | |
| 
 | |
| 		result = init_chapter_index_page(volume, index_page, physical_chapter, i,
 | |
| 						 &index_pages[i]);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return result;
 | |
| 	}
 | |
| 
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request,
 | |
| 				 bool *found)
 | |
| {
 | |
| 	int result;
 | |
| 	u32 physical_chapter =
 | |
| 		uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter);
 | |
| 	u32 index_page_number;
 | |
| 	u16 record_page_number;
 | |
| 
 | |
| 	index_page_number = uds_find_index_page_number(volume->index_page_map,
 | |
| 						       &request->record_name,
 | |
| 						       physical_chapter);
 | |
| 
 | |
| 	if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) {
 | |
| 		record_page_number = *((u16 *) &request->old_metadata);
 | |
| 	} else {
 | |
| 		result = search_cached_index_page(volume, request, physical_chapter,
 | |
| 						  index_page_number,
 | |
| 						  &record_page_number);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return result;
 | |
| 	}
 | |
| 
 | |
| 	return uds_search_cached_record_page(volume, request, physical_chapter,
 | |
| 					     record_page_number, found);
 | |
| }
 | |
| 
 | |
| int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
 | |
| 					     const struct uds_record_name *name,
 | |
| 					     u64 virtual_chapter, bool *found)
 | |
| {
 | |
| 	int result;
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	struct cached_page *page;
 | |
| 	u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
 | |
| 	u32 index_page_number;
 | |
| 	u16 record_page_number;
 | |
| 	u32 page_number;
 | |
| 
 | |
| 	*found = false;
 | |
| 	index_page_number =
 | |
| 		uds_find_index_page_number(volume->index_page_map, name,
 | |
| 					   physical_chapter);
 | |
| 	result = get_volume_page(volume, physical_chapter, index_page_number, &page);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = uds_search_chapter_index_page(&page->index_page, geometry, name,
 | |
| 					       &record_page_number);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
 | |
| 		return UDS_SUCCESS;
 | |
| 
 | |
| 	page_number = geometry->index_pages_per_chapter + record_page_number;
 | |
| 	result = get_volume_page(volume, physical_chapter, page_number, &page);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	*found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
 | |
| 				    geometry, NULL);
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static void invalidate_page(struct page_cache *cache, u32 physical_page)
 | |
| {
 | |
| 	struct cached_page *page;
 | |
| 	int queue_index = -1;
 | |
| 
 | |
| 	/* We hold the read_threads_mutex. */
 | |
| 	get_page_and_index(cache, physical_page, &queue_index, &page);
 | |
| 	if (page != NULL) {
 | |
| 		WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
 | |
| 		wait_for_pending_searches(cache, page->physical_page);
 | |
| 		clear_cache_page(cache, page);
 | |
| 	} else if (queue_index > -1) {
 | |
| 		vdo_log_debug("setting pending read to invalid");
 | |
| 		cache->read_queue[queue_index].invalid = true;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
 | |
| {
 | |
| 	u32 physical_chapter =
 | |
| 		uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
 | |
| 	u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
 | |
| 	u32 i;
 | |
| 
 | |
| 	vdo_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
 | |
| 	mutex_lock(&volume->read_threads_mutex);
 | |
| 	for (i = 0; i < volume->geometry->pages_per_chapter; i++)
 | |
| 		invalidate_page(&volume->page_cache, first_page + i);
 | |
| 	mutex_unlock(&volume->read_threads_mutex);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Donate an index pages from a newly written chapter to the page cache since it is likely to be
 | |
|  * used again soon. The caller must already hold the reader thread mutex.
 | |
|  */
 | |
| static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
 | |
| 				    u32 index_page_number, struct dm_buffer *page_buffer)
 | |
| {
 | |
| 	int result;
 | |
| 	struct cached_page *page = NULL;
 | |
| 	u32 physical_page =
 | |
| 		map_to_physical_page(volume->geometry, physical_chapter,
 | |
| 				     index_page_number);
 | |
| 
 | |
| 	page = select_victim_in_cache(&volume->page_cache);
 | |
| 	page->buffer = page_buffer;
 | |
| 	result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
 | |
| 					 physical_chapter, index_page_number,
 | |
| 					 &page->index_page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		vdo_log_warning("Error initialize chapter index page");
 | |
| 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	result = put_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		vdo_log_warning("Error putting page %u in cache", physical_page);
 | |
| 		cancel_page_in_cache(&volume->page_cache, physical_page, page);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
 | |
| 			     struct open_chapter_index *chapter_index)
 | |
| {
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	struct dm_buffer *page_buffer;
 | |
| 	u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
 | |
| 	u32 delta_list_number = 0;
 | |
| 	u32 index_page_number;
 | |
| 
 | |
| 	for (index_page_number = 0;
 | |
| 	     index_page_number < geometry->index_pages_per_chapter;
 | |
| 	     index_page_number++) {
 | |
| 		u8 *page_data;
 | |
| 		u32 physical_page = first_index_page + index_page_number;
 | |
| 		u32 lists_packed;
 | |
| 		bool last_page;
 | |
| 		int result;
 | |
| 
 | |
| 		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
 | |
| 		if (IS_ERR(page_data)) {
 | |
| 			return vdo_log_warning_strerror(-PTR_ERR(page_data),
 | |
| 							"failed to prepare index page");
 | |
| 		}
 | |
| 
 | |
| 		last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
 | |
| 		result = uds_pack_open_chapter_index_page(chapter_index, page_data,
 | |
| 							  delta_list_number, last_page,
 | |
| 							  &lists_packed);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			dm_bufio_release(page_buffer);
 | |
| 			return vdo_log_warning_strerror(result,
 | |
| 							"failed to pack index page");
 | |
| 		}
 | |
| 
 | |
| 		dm_bufio_mark_buffer_dirty(page_buffer);
 | |
| 
 | |
| 		if (lists_packed == 0) {
 | |
| 			vdo_log_debug("no delta lists packed on chapter %u page %u",
 | |
| 				      physical_chapter_number, index_page_number);
 | |
| 		} else {
 | |
| 			delta_list_number += lists_packed;
 | |
| 		}
 | |
| 
 | |
| 		uds_update_index_page_map(volume->index_page_map,
 | |
| 					  chapter_index->virtual_chapter_number,
 | |
| 					  physical_chapter_number, index_page_number,
 | |
| 					  delta_list_number - 1);
 | |
| 
 | |
| 		mutex_lock(&volume->read_threads_mutex);
 | |
| 		result = donate_index_page_locked(volume, physical_chapter_number,
 | |
| 						  index_page_number, page_buffer);
 | |
| 		mutex_unlock(&volume->read_threads_mutex);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			dm_bufio_release(page_buffer);
 | |
| 			return result;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static u32 encode_tree(u8 record_page[],
 | |
| 		       const struct uds_volume_record *sorted_pointers[],
 | |
| 		       u32 next_record, u32 node, u32 node_count)
 | |
| {
 | |
| 	if (node < node_count) {
 | |
| 		u32 child = (2 * node) + 1;
 | |
| 
 | |
| 		next_record = encode_tree(record_page, sorted_pointers, next_record,
 | |
| 					  child, node_count);
 | |
| 
 | |
| 		/*
 | |
| 		 * In-order traversal: copy the contents of the next record into the page at the
 | |
| 		 * node offset.
 | |
| 		 */
 | |
| 		memcpy(&record_page[node * BYTES_PER_RECORD],
 | |
| 		       sorted_pointers[next_record++], BYTES_PER_RECORD);
 | |
| 
 | |
| 		next_record = encode_tree(record_page, sorted_pointers, next_record,
 | |
| 					  child + 1, node_count);
 | |
| 	}
 | |
| 
 | |
| 	return next_record;
 | |
| }
 | |
| 
 | |
| static int encode_record_page(const struct volume *volume,
 | |
| 			      const struct uds_volume_record records[], u8 record_page[])
 | |
| {
 | |
| 	int result;
 | |
| 	u32 i;
 | |
| 	u32 records_per_page = volume->geometry->records_per_page;
 | |
| 	const struct uds_volume_record **record_pointers = volume->record_pointers;
 | |
| 
 | |
| 	for (i = 0; i < records_per_page; i++)
 | |
| 		record_pointers[i] = &records[i];
 | |
| 
 | |
| 	/*
 | |
| 	 * Sort the record pointers by using just the names in the records, which is less work than
 | |
| 	 * sorting the entire record values.
 | |
| 	 */
 | |
| 	BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0);
 | |
| 	result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers,
 | |
| 				records_per_page, UDS_RECORD_NAME_SIZE);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	encode_tree(record_page, record_pointers, 0, 0, records_per_page);
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static int write_record_pages(struct volume *volume, u32 physical_chapter_number,
 | |
| 			      const struct uds_volume_record *records)
 | |
| {
 | |
| 	u32 record_page_number;
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	struct dm_buffer *page_buffer;
 | |
| 	const struct uds_volume_record *next_record = records;
 | |
| 	u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number,
 | |
| 						     geometry->index_pages_per_chapter);
 | |
| 
 | |
| 	for (record_page_number = 0;
 | |
| 	     record_page_number < geometry->record_pages_per_chapter;
 | |
| 	     record_page_number++) {
 | |
| 		u8 *page_data;
 | |
| 		u32 physical_page = first_record_page + record_page_number;
 | |
| 		int result;
 | |
| 
 | |
| 		page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
 | |
| 		if (IS_ERR(page_data)) {
 | |
| 			return vdo_log_warning_strerror(-PTR_ERR(page_data),
 | |
| 							"failed to prepare record page");
 | |
| 		}
 | |
| 
 | |
| 		result = encode_record_page(volume, next_record, page_data);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			dm_bufio_release(page_buffer);
 | |
| 			return vdo_log_warning_strerror(result,
 | |
| 							"failed to encode record page %u",
 | |
| 							record_page_number);
 | |
| 		}
 | |
| 
 | |
| 		next_record += geometry->records_per_page;
 | |
| 		dm_bufio_mark_buffer_dirty(page_buffer);
 | |
| 		dm_bufio_release(page_buffer);
 | |
| 	}
 | |
| 
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index,
 | |
| 		      const struct uds_volume_record *records)
 | |
| {
 | |
| 	int result;
 | |
| 	u32 physical_chapter_number =
 | |
| 		uds_map_to_physical_chapter(volume->geometry,
 | |
| 					    chapter_index->virtual_chapter_number);
 | |
| 
 | |
| 	result = write_index_pages(volume, physical_chapter_number, chapter_index);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = write_record_pages(volume, physical_chapter_number, records);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = -dm_bufio_write_dirty_buffers(volume->client);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		vdo_log_error_strerror(result, "cannot sync chapter to volume");
 | |
| 
 | |
| 	return result;
 | |
| }
 | |
| 
 | |
| static void probe_chapter(struct volume *volume, u32 chapter_number,
 | |
| 			  u64 *virtual_chapter_number)
 | |
| {
 | |
| 	const struct index_geometry *geometry = volume->geometry;
 | |
| 	u32 expected_list_number = 0;
 | |
| 	u32 i;
 | |
| 	u64 vcn = BAD_CHAPTER;
 | |
| 
 | |
| 	*virtual_chapter_number = BAD_CHAPTER;
 | |
| 	dm_bufio_prefetch(volume->client,
 | |
| 			  map_to_physical_page(geometry, chapter_number, 0),
 | |
| 			  geometry->index_pages_per_chapter);
 | |
| 
 | |
| 	for (i = 0; i < geometry->index_pages_per_chapter; i++) {
 | |
| 		struct delta_index_page *page;
 | |
| 		int result;
 | |
| 
 | |
| 		result = uds_get_volume_index_page(volume, chapter_number, i, &page);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return;
 | |
| 
 | |
| 		if (page->virtual_chapter_number == BAD_CHAPTER) {
 | |
| 			vdo_log_error("corrupt index page in chapter %u",
 | |
| 				      chapter_number);
 | |
| 			return;
 | |
| 		}
 | |
| 
 | |
| 		if (vcn == BAD_CHAPTER) {
 | |
| 			vcn = page->virtual_chapter_number;
 | |
| 		} else if (page->virtual_chapter_number != vcn) {
 | |
| 			vdo_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
 | |
| 				      chapter_number, i, (unsigned long long) vcn,
 | |
| 				      (unsigned long long) page->virtual_chapter_number);
 | |
| 			return;
 | |
| 		}
 | |
| 
 | |
| 		if (expected_list_number != page->lowest_list_number) {
 | |
| 			vdo_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
 | |
| 				      chapter_number, i, expected_list_number,
 | |
| 				      page->lowest_list_number);
 | |
| 			return;
 | |
| 		}
 | |
| 		expected_list_number = page->highest_list_number + 1;
 | |
| 
 | |
| 		result = uds_validate_chapter_index_page(page, geometry);
 | |
| 		if (result != UDS_SUCCESS)
 | |
| 			return;
 | |
| 	}
 | |
| 
 | |
| 	if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) {
 | |
| 		vdo_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
 | |
| 			      (unsigned long long) vcn, geometry->chapters_per_volume);
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	*virtual_chapter_number = vcn;
 | |
| }
 | |
| 
 | |
| /* Find the last valid physical chapter in the volume. */
 | |
| static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr)
 | |
| {
 | |
| 	u32 span = 1;
 | |
| 	u32 tries = 0;
 | |
| 
 | |
| 	while (limit > 0) {
 | |
| 		u32 chapter = (span > limit) ? 0 : limit - span;
 | |
| 		u64 vcn = 0;
 | |
| 
 | |
| 		probe_chapter(volume, chapter, &vcn);
 | |
| 		if (vcn == BAD_CHAPTER) {
 | |
| 			limit = chapter;
 | |
| 			if (++tries > 1)
 | |
| 				span *= 2;
 | |
| 		} else {
 | |
| 			if (span == 1)
 | |
| 				break;
 | |
| 			span /= 2;
 | |
| 			tries = 0;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	*limit_ptr = limit;
 | |
| }
 | |
| 
 | |
| static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn,
 | |
| 			       u64 *highest_vcn)
 | |
| {
 | |
| 	struct index_geometry *geometry = volume->geometry;
 | |
| 	u64 zero_vcn;
 | |
| 	u64 lowest = BAD_CHAPTER;
 | |
| 	u64 highest = BAD_CHAPTER;
 | |
| 	u64 moved_chapter = BAD_CHAPTER;
 | |
| 	u32 left_chapter = 0;
 | |
| 	u32 right_chapter = 0;
 | |
| 	u32 bad_chapters = 0;
 | |
| 
 | |
| 	/*
 | |
| 	 * This method assumes there is at most one run of contiguous bad chapters caused by
 | |
| 	 * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the
 | |
| 	 * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the
 | |
| 	 * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately
 | |
| 	 * precedes the lowest one.
 | |
| 	 */
 | |
| 
 | |
| 	/* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */
 | |
| 	probe_chapter(volume, 0, &zero_vcn);
 | |
| 
 | |
| 	/*
 | |
| 	 * Binary search for end of the discontinuity in the monotonically increasing virtual
 | |
| 	 * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're
 | |
| 	 * searching for the index of the smallest value less than zero_vcn. In the case we go off
 | |
| 	 * the end it means that chapter 0 has the lowest vcn.
 | |
| 	 *
 | |
| 	 * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always
 | |
| 	 * skip over the moved chapter when searching, adding it to the range at the end if
 | |
| 	 * necessary.
 | |
| 	 */
 | |
| 	if (geometry->remapped_physical > 0) {
 | |
| 		u64 remapped_vcn;
 | |
| 
 | |
| 		probe_chapter(volume, geometry->remapped_physical, &remapped_vcn);
 | |
| 		if (remapped_vcn == geometry->remapped_virtual)
 | |
| 			moved_chapter = geometry->remapped_physical;
 | |
| 	}
 | |
| 
 | |
| 	left_chapter = 0;
 | |
| 	right_chapter = chapter_limit;
 | |
| 
 | |
| 	while (left_chapter < right_chapter) {
 | |
| 		u64 probe_vcn;
 | |
| 		u32 chapter = (left_chapter + right_chapter) / 2;
 | |
| 
 | |
| 		if (chapter == moved_chapter)
 | |
| 			chapter--;
 | |
| 
 | |
| 		probe_chapter(volume, chapter, &probe_vcn);
 | |
| 		if (zero_vcn <= probe_vcn) {
 | |
| 			left_chapter = chapter + 1;
 | |
| 			if (left_chapter == moved_chapter)
 | |
| 				left_chapter++;
 | |
| 		} else {
 | |
| 			right_chapter = chapter;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	/* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number.*/
 | |
| 	if (left_chapter >= chapter_limit)
 | |
| 		left_chapter = 0;
 | |
| 
 | |
| 	/* At this point, left_chapter is the chapter with the lowest virtual chapter number. */
 | |
| 	probe_chapter(volume, left_chapter, &lowest);
 | |
| 
 | |
| 	/* The moved chapter might be the lowest in the range. */
 | |
| 	if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1))
 | |
| 		lowest = geometry->remapped_virtual;
 | |
| 
 | |
| 	/*
 | |
| 	 * Circularly scan backwards, moving over any bad chapters until encountering a good one,
 | |
| 	 * which is the chapter with the highest vcn.
 | |
| 	 */
 | |
| 	while (highest == BAD_CHAPTER) {
 | |
| 		right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit;
 | |
| 		if (right_chapter == moved_chapter)
 | |
| 			continue;
 | |
| 
 | |
| 		probe_chapter(volume, right_chapter, &highest);
 | |
| 		if (bad_chapters++ >= MAX_BAD_CHAPTERS) {
 | |
| 			vdo_log_error("too many bad chapters in volume: %u",
 | |
| 				      bad_chapters);
 | |
| 			return UDS_CORRUPT_DATA;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	*lowest_vcn = lowest;
 | |
| 	*highest_vcn = highest;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Find the highest and lowest contiguous chapters present in the volume and determine their
 | |
|  * virtual chapter numbers. This is used by rebuild.
 | |
|  */
 | |
| int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn,
 | |
| 				       u64 *highest_vcn, bool *is_empty)
 | |
| {
 | |
| 	u32 chapter_limit = volume->geometry->chapters_per_volume;
 | |
| 
 | |
| 	find_real_end_of_volume(volume, chapter_limit, &chapter_limit);
 | |
| 	if (chapter_limit == 0) {
 | |
| 		*lowest_vcn = 0;
 | |
| 		*highest_vcn = 0;
 | |
| 		*is_empty = true;
 | |
| 		return UDS_SUCCESS;
 | |
| 	}
 | |
| 
 | |
| 	*is_empty = false;
 | |
| 	return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn);
 | |
| }
 | |
| 
 | |
| int __must_check uds_replace_volume_storage(struct volume *volume,
 | |
| 					    struct index_layout *layout,
 | |
| 					    struct block_device *bdev)
 | |
| {
 | |
| 	int result;
 | |
| 	u32 i;
 | |
| 
 | |
| 	result = uds_replace_index_layout_storage(layout, bdev);
 | |
| 	if (result != UDS_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	/* Release all outstanding dm_bufio objects */
 | |
| 	for (i = 0; i < volume->page_cache.indexable_pages; i++)
 | |
| 		volume->page_cache.index[i] = volume->page_cache.cache_slots;
 | |
| 	for (i = 0; i < volume->page_cache.cache_slots; i++)
 | |
| 		clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]);
 | |
| 	if (volume->sparse_cache != NULL)
 | |
| 		uds_invalidate_sparse_cache(volume->sparse_cache);
 | |
| 	if (volume->client != NULL)
 | |
| 		dm_bufio_client_destroy(vdo_forget(volume->client));
 | |
| 
 | |
| 	return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page,
 | |
| 				     volume->reserved_buffers, &volume->client);
 | |
| }
 | |
| 
 | |
| static int __must_check initialize_page_cache(struct page_cache *cache,
 | |
| 					      const struct index_geometry *geometry,
 | |
| 					      u32 chapters_in_cache,
 | |
| 					      unsigned int zone_count)
 | |
| {
 | |
| 	int result;
 | |
| 	u32 i;
 | |
| 
 | |
| 	cache->indexable_pages = geometry->pages_per_volume + 1;
 | |
| 	cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter;
 | |
| 	cache->zone_count = zone_count;
 | |
| 	atomic64_set(&cache->clock, 1);
 | |
| 
 | |
| 	result = VDO_ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES),
 | |
| 			    "requested cache size, %u, within limit %u",
 | |
| 			    cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read,
 | |
| 			      "volume read queue", &cache->read_queue);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = vdo_allocate(cache->zone_count, struct search_pending_counter,
 | |
| 			      "Volume Cache Zones", &cache->search_pending_counters);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = vdo_allocate(cache->indexable_pages, u16, "page cache index",
 | |
| 			      &cache->index);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache",
 | |
| 			      &cache->cache);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	/* Initialize index values to invalid values. */
 | |
| 	for (i = 0; i < cache->indexable_pages; i++)
 | |
| 		cache->index[i] = cache->cache_slots;
 | |
| 
 | |
| 	for (i = 0; i < cache->cache_slots; i++)
 | |
| 		clear_cache_page(cache, &cache->cache[i]);
 | |
| 
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout,
 | |
| 		    struct volume **new_volume)
 | |
| {
 | |
| 	unsigned int i;
 | |
| 	struct volume *volume = NULL;
 | |
| 	struct index_geometry *geometry;
 | |
| 	unsigned int reserved_buffers;
 | |
| 	int result;
 | |
| 
 | |
| 	result = vdo_allocate(1, struct volume, "volume", &volume);
 | |
| 	if (result != VDO_SUCCESS)
 | |
| 		return result;
 | |
| 
 | |
| 	volume->nonce = uds_get_volume_nonce(layout);
 | |
| 
 | |
| 	result = uds_copy_index_geometry(config->geometry, &volume->geometry);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return vdo_log_warning_strerror(result,
 | |
| 						"failed to allocate geometry: error");
 | |
| 	}
 | |
| 	geometry = volume->geometry;
 | |
| 
 | |
| 	/*
 | |
| 	 * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one
 | |
| 	 * for each entry in the sparse cache.
 | |
| 	 */
 | |
| 	reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter;
 | |
| 	reserved_buffers += 1;
 | |
| 	if (uds_is_sparse_index_geometry(geometry))
 | |
| 		reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter);
 | |
| 	volume->reserved_buffers = reserved_buffers;
 | |
| 	result = uds_open_volume_bufio(layout, geometry->bytes_per_page,
 | |
| 				       volume->reserved_buffers, &volume->client);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	result = uds_make_radix_sorter(geometry->records_per_page,
 | |
| 				       &volume->radix_sorter);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	result = vdo_allocate(geometry->records_per_page,
 | |
| 			      const struct uds_volume_record *, "record pointers",
 | |
| 			      &volume->record_pointers);
 | |
| 	if (result != VDO_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	if (uds_is_sparse_index_geometry(geometry)) {
 | |
| 		size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page;
 | |
| 
 | |
| 		result = uds_make_sparse_cache(geometry, config->cache_chapters,
 | |
| 					       config->zone_count,
 | |
| 					       &volume->sparse_cache);
 | |
| 		if (result != UDS_SUCCESS) {
 | |
| 			uds_free_volume(volume);
 | |
| 			return result;
 | |
| 		}
 | |
| 
 | |
| 		volume->cache_size =
 | |
| 			page_size * geometry->index_pages_per_chapter * config->cache_chapters;
 | |
| 	}
 | |
| 
 | |
| 	result = initialize_page_cache(&volume->page_cache, geometry,
 | |
| 				       config->cache_chapters, config->zone_count);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page);
 | |
| 	result = uds_make_index_page_map(geometry, &volume->index_page_map);
 | |
| 	if (result != UDS_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	mutex_init(&volume->read_threads_mutex);
 | |
| 	uds_init_cond(&volume->read_threads_read_done_cond);
 | |
| 	uds_init_cond(&volume->read_threads_cond);
 | |
| 
 | |
| 	result = vdo_allocate(config->read_threads, struct thread *, "reader threads",
 | |
| 			      &volume->reader_threads);
 | |
| 	if (result != VDO_SUCCESS) {
 | |
| 		uds_free_volume(volume);
 | |
| 		return result;
 | |
| 	}
 | |
| 
 | |
| 	for (i = 0; i < config->read_threads; i++) {
 | |
| 		result = vdo_create_thread(read_thread_function, (void *) volume,
 | |
| 					   "reader", &volume->reader_threads[i]);
 | |
| 		if (result != VDO_SUCCESS) {
 | |
| 			uds_free_volume(volume);
 | |
| 			return result;
 | |
| 		}
 | |
| 
 | |
| 		volume->read_thread_count = i + 1;
 | |
| 	}
 | |
| 
 | |
| 	*new_volume = volume;
 | |
| 	return UDS_SUCCESS;
 | |
| }
 | |
| 
 | |
| static void uninitialize_page_cache(struct page_cache *cache)
 | |
| {
 | |
| 	u16 i;
 | |
| 
 | |
| 	if (cache->cache != NULL) {
 | |
| 		for (i = 0; i < cache->cache_slots; i++)
 | |
| 			release_page_buffer(&cache->cache[i]);
 | |
| 	}
 | |
| 	vdo_free(cache->index);
 | |
| 	vdo_free(cache->cache);
 | |
| 	vdo_free(cache->search_pending_counters);
 | |
| 	vdo_free(cache->read_queue);
 | |
| }
 | |
| 
 | |
| void uds_free_volume(struct volume *volume)
 | |
| {
 | |
| 	if (volume == NULL)
 | |
| 		return;
 | |
| 
 | |
| 	if (volume->reader_threads != NULL) {
 | |
| 		unsigned int i;
 | |
| 
 | |
| 		/* This works even if some threads weren't started. */
 | |
| 		mutex_lock(&volume->read_threads_mutex);
 | |
| 		volume->read_threads_exiting = true;
 | |
| 		uds_broadcast_cond(&volume->read_threads_cond);
 | |
| 		mutex_unlock(&volume->read_threads_mutex);
 | |
| 		for (i = 0; i < volume->read_thread_count; i++)
 | |
| 			vdo_join_threads(volume->reader_threads[i]);
 | |
| 		vdo_free(volume->reader_threads);
 | |
| 		volume->reader_threads = NULL;
 | |
| 	}
 | |
| 
 | |
| 	/* Must destroy the client AFTER freeing the cached pages. */
 | |
| 	uninitialize_page_cache(&volume->page_cache);
 | |
| 	uds_free_sparse_cache(volume->sparse_cache);
 | |
| 	if (volume->client != NULL)
 | |
| 		dm_bufio_client_destroy(vdo_forget(volume->client));
 | |
| 
 | |
| 	uds_free_index_page_map(volume->index_page_map);
 | |
| 	uds_free_radix_sorter(volume->radix_sorter);
 | |
| 	vdo_free(volume->geometry);
 | |
| 	vdo_free(volume->record_pointers);
 | |
| 	vdo_free(volume);
 | |
| }
 |