Coyote/driver/hypervisor/hypervisor_mmu.c

/**
* Copyright (c) 2023, Systems Group, ETH Zurich
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "hypervisor_mmu.h"
/**
 * @brief Pin user pages allocated in a vm.
 * This is a modified version that works on the notifier that is passed
 * down from the vm.
 *
 * The function reads the notifier and pins the pages of the vm.
 * The notifier contains guest physical addresses (gpa), so the function
 * first translates every gpa to a host virtual address (hva) in the vmm
 * process and then uses the hvas to pin the corresponding physical pages.
 *
 * After this step, the function installs mappings in the fpga TLB for the
 * guest virtual address given in the notifier. Whether the huge-page TLB or
 * the small-page TLB is used is determined by the notifier. This information
 * has to be passed down explicitly because, if the vm itself is backed by
 * huge pages, every guest page looks like a huge page to the hypervisor, so
 * the user's intent inside the vm must be forwarded through the notifier.
 *
 * In a last step, the mappings are actually written out to the fpga.
 *
 * @param d mediated device
 * @param notifier notifier from the vm, already copied into the kernel
 * @return int 0 if successful
 */
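/*
 * Summary of the notifier fields as they are consumed below (the struct
 * itself is declared in the hypervisor headers): gva is the guest virtual
 * address of the buffer, gpas[] holds the guest physical address of every
 * page, npages is the number of entries in gpas[], len is the length of the
 * region in bytes, is_huge selects the huge-page TLB, and cpid identifies
 * the Coyote process the mapping belongs to.
 */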
int hypervisor_tlb_get_user_pages(struct m_fpga_dev *d, struct hypervisor_map_notifier *notifier)
{
int ret_val, i, j;
struct fpga_dev *fd;
struct bus_drvdata *pd;
struct mm_struct *curr_mm;
struct task_struct *curr_task;
struct kvm *kvm;
pid_t pid;
uint64_t first, last;
uint64_t curr_vaddr, last_vaddr, vaddr_tmp, gva;
int n_pages, n_pages_huge;
int hugepages;
struct user_pages *user_pg;
uint64_t *hpages_phys = NULL, *map_array = NULL;
uint64_t count;
uint64_t *kvm_hvas;
ret_val = 0;
BUG_ON(!d);
fd = d->fpga;
BUG_ON(!fd);
pd = fd->pd;
BUG_ON(!pd);
kvm = d->kvm;
BUG_ON(!kvm);
BUG_ON(!notifier);
BUG_ON(notifier->npages == 0);
// number of pages
n_pages = notifier->npages;
gva = notifier->gva;
dbg_info("Going to pin %d pages for gva %llx\n", n_pages, gva);
// Get mmu context from kvm process
curr_mm = kvm->mm;
pid = kvm->userspace_pid;
curr_task = pid_task(find_vpid(pid), PIDTYPE_PID);
// translate the guest physical addresses into host virtual addresses in the kvm userspace process
kvm_hvas = kcalloc(notifier->npages, sizeof(uint64_t), GFP_KERNEL);
if (!kvm_hvas)
{
ret_val = -ENOMEM;
goto err_hvas;
}
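// gfn_to_hva() resolves each guest frame through the kvm memslots to the
// user-space virtual address in the vmm process that backs it; these hvas
// are what actually gets pinned below.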
for (i = 0; i < n_pages; i++)
{
kvm_hvas[i] = gfn_to_hva(kvm, gpa_to_gfn(notifier->gpas[i]));
}
count = notifier->len;
// hugepages support passed from vm
hugepages = (int) notifier->is_huge;
if (hugepages)
{
if (n_pages > MAX_N_MAP_HUGE_PAGES)
n_pages = MAX_N_MAP_HUGE_PAGES;
}
else
{
if (n_pages > MAX_N_MAP_PAGES)
n_pages = MAX_N_MAP_PAGES;
}
// overflow and empty-range checks; bail out through the error path
// so that the hva array does not leak
if (gva + count < gva)
{
ret_val = -EINVAL;
goto err_user_pg;
}
if (count == 0)
{
goto err_user_pg;
}
// allocate management structs
user_pg = kzalloc(sizeof(struct user_pages), GFP_KERNEL);
if (!user_pg)
{
ret_val = -ENOMEM;
goto err_user_pg;
}
user_pg->hpages = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
if (!user_pg->hpages)
{
ret_val = -ENOMEM;
goto err_hpages;
}
// Pin all pages obtained from the vm
for (i = 0; i < n_pages; i++)
{
// pin pages of the kvm
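// Since kernel 5.9, get_user_pages_remote() no longer takes a task_struct.
// In both variants the flag value 1 requests write access (FOLL_WRITE) and
// exactly one page is pinned per loop iteration.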
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
ret_val = get_user_pages_remote(curr_mm, (unsigned long)kvm_hvas[i], 1, 1, user_pg->hpages + i, NULL, NULL);
#else
ret_val = get_user_pages_remote(curr_task, curr_mm, (unsigned long)kvm_hvas[i], 1, 1, user_pg->hpages + i, NULL);
#endif
if (ret_val != 1 || !user_pg->hpages[i])
{
pr_info("%s.%d: Failed to pin all pages, failed to map %d with ret_val %d\n", __func__, __LINE__, i, ret_val);
goto err_pin_pages;
}
// dbg_info("pinned page hpa: %llx\n", page_to_phys(user_pg->hpages[i]));
}
dbg_info("Pinned pages\n");
// Reset ret_val
ret_val = 0;
// flush cache
for (i = 0; i < n_pages; i++)
{
flush_dcache_page(user_pg->hpages[i]);
}
// populate map entry
user_pg->vaddr = gva;
user_pg->n_hpages = n_pages;
user_pg->huge = hugepages;
dbg_info("mapping vaddr %llx, cpid %llu, hugepages %llu\n", gva, notifier->cpid, notifier->is_huge);
vaddr_tmp = gva;
if (hugepages) // For hugepages
{
// Shift page numbers to work on huge pages
first = (gva & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift;
last = ((gva + count - 1) & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift;
n_pages_huge = last - first + 1;
user_pg->n_pages = n_pages_huge;
// allocate pages array
hpages_phys = kzalloc(n_pages_huge * sizeof(uint64_t), GFP_KERNEL);
if (!hpages_phys)
{
ret_val = -ENOMEM;
goto err_phys_pages;
}
j = 0;
curr_vaddr = gva;
last_vaddr = -1;
// Get the hpa for the huge pages
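// The pages above were pinned at 4KiB granularity, so consecutive entries
// can fall on the same huge page; walk the region in PAGE_SIZE steps and
// record one host physical address per distinct huge-page frame.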
for (i = 0; i < n_pages; i++)
{
// Only store an entry if we encounter a new huge page
if (((curr_vaddr & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift) !=
((last_vaddr & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift))
{
hpages_phys[j] = page_to_phys(user_pg->hpages[i]) & pd->ltlb_order->page_mask;
last_vaddr = curr_vaddr;
j++;
}
curr_vaddr += PAGE_SIZE;
}
// If we have memory attached on the card we want to allocate
// the same amount of memory on the card
if (pd->en_mem)
{
// Allocate memory
user_pg->cpages = kzalloc(n_pages_huge * sizeof(uint64_t), GFP_KERNEL);
if (!user_pg->cpages)
{
pr_info("Failed to allocate card buffer");
ret_val = -ENOMEM;
goto err_cpages;
}
// Alloc 2MB chunks from the card
ret_val = card_alloc(fd, user_pg->cpages, n_pages_huge, LARGE_CHUNK_ALLOC);
if (ret_val)
{
pr_info("Failed to allocate card memory");
ret_val = -ENOMEM;
goto err_card_mem;
}
dbg_info("card allocated %d hugepages in hypervisor\n", n_pages_huge);
}
// alloc map array
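// Every TLB mapping occupies two 64-bit words, which is why the array holds
// 2 * n_pages_huge entries and tlb_create_map() fills &map_array[2 * i].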
map_array = kzalloc(n_pages_huge * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
pr_info("Failed to allocate map buffers\n");
ret_val = -ENOMEM;
goto err_map_buffer;
}
vaddr_tmp = gva;
// populate map with mappings from guest virtual address
// to host physical addresses
for (i = 0; i < n_pages_huge; i++)
{
tlb_create_map(pd->ltlb_order,
vaddr_tmp,
hpages_phys[i],
(pd->en_mem ? user_pg->cpages[i] : 0),
notifier->cpid,
&map_array[2 * i]);
vaddr_tmp += pd->ltlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// Fire the mappings to the fpga
tlb_service_dev(fd, pd->ltlb_order, map_array, n_pages_huge);
#endif
kfree(map_array);
kfree(hpages_phys);
}
else // For small pages
{
user_pg->n_pages = n_pages;
// if memory is attached to the card we want to allocate
// the same amount on the card
if (pd->en_mem)
{
// Allocate pages management array
user_pg->cpages = kzalloc(n_pages * sizeof(uint64_t), GFP_KERNEL);
if (!user_pg->cpages)
{
dbg_info("could not allocate card buffer\n");
ret_val = -ENOMEM;
goto err_cpages;
}
// Allocate 4KB chunks of card memory
ret_val = card_alloc(fd, user_pg->cpages, n_pages, SMALL_CHUNK_ALLOC);
if (ret_val)
{
dbg_info("could not get all card pages, %d\n", ret_val);
goto err_card_mem;
}
}
// allocate map array
map_array = (uint64_t *)kzalloc(n_pages * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
dbg_info("map buffers could not be allocated\n");
ret_val = -ENOMEM;
if (pd->en_mem)
card_free(fd, user_pg->cpages, n_pages, SMALL_CHUNK_ALLOC);
goto err_card_mem;
}
// populate mappings array with guest virtual address
// to host physical mapping.
for (i = 0; i < n_pages; i++)
{
tlb_create_map(pd->stlb_order,
vaddr_tmp,
page_to_phys(user_pg->hpages[i]),
(pd->en_mem ? user_pg->cpages[i] : 0),
notifier->cpid, &map_array[2 * i]);
vaddr_tmp += pd->stlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// fire interrupt to install the mappings on the fpga
tlb_service_dev(fd, pd->stlb_order, map_array, n_pages);
#endif
// free buffers
kfree(map_array);
}
// Add the entry to the sbuff hash map. This is used for
// management of the allocated memory and allows it to be
// deallocated later on.
hash_add(d->sbuff_map, &user_pg->entry, notifier->gva);
// the temporary hva array is not needed anymore once the pages are pinned and mapped
kfree(kvm_hvas);
return ret_val;
err_pin_pages:
for (j = 0; j < i; j++)
{
put_page(user_pg->hpages[j]);
}
kfree(user_pg->hpages);
kfree(user_pg);
kfree(kvm_hvas);
return -ENOMEM;
err_map_buffer:
if (pd->en_mem)
card_free(fd, user_pg->cpages, n_pages_huge, LARGE_CHUNK_ALLOC);
err_card_mem:
kfree(user_pg->cpages);
err_cpages:
kfree(hpages_phys);
err_phys_pages:
for (i = 0; i < user_pg->n_hpages; i++)
{
put_page(user_pg->hpages[i]);
}
err_hpages:
kfree(user_pg);
err_user_pg:
kfree(kvm_hvas);
err_hvas:
return ret_val;
}
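/*
 * Illustrative call sequence (a sketch only; the actual entry point lives in
 * the hypervisor frontend that receives the notifier from the vm, not in
 * this file):
 *
 *   // notifier already copied from the guest into kernel memory
 *   ret = hypervisor_tlb_get_user_pages(md, notifier);   // pin + map
 *   ...                                                   // fpga uses the buffer
 *   ret = hypervisor_tlb_put_user_pages(md, notifier);    // unmap + unpin
 */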
/**
 * @brief Unmap an entry described by a user_pages struct. This is the
 * hypervisor version of put user pages. It is a small refactor of the
 * original version: this function contains the code that actually unmaps
 * a buffer and is in turn used by both the put and the put-all functions.
 *
 * @param md mediated device
 * @param tmp_buffer user pages struct that describes the mapped region
 * @param dirtied indicates if all pages should be marked dirty before putting
 * @return int 0 if successful
 */
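/*
 * Note: unmap_entry() only unpins the host pages, releases the card memory
 * and clears the fpga TLB entries; removing the user_pages struct from the
 * hash table and freeing it is left to the callers below.
 */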
static int unmap_entry(struct m_fpga_dev *md, struct user_pages *tmp_buffer, int dirtied)
{
int i;
struct fpga_dev *d;
struct bus_drvdata *pd;
uint64_t vaddr_tmp, vaddr;
uint64_t *map_array;
int32_t cpid;
struct tlb_order *tlb_order;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!pd);
BUG_ON(!tmp_buffer);
BUG_ON(!(tmp_buffer->hpages));
dbg_info("Putting user_pages entry\n");
vaddr = tmp_buffer->vaddr;
cpid = tmp_buffer->cpid;
// If the pages should be dirtied,
// go through all pages and mark them as dirty.
if (dirtied)
{
for (i = 0; i < tmp_buffer->n_hpages; i++)
{
if (tmp_buffer->hpages[i])
SetPageDirty(tmp_buffer->hpages[i]);
else
dbg_info("entry for page %d is NULL!\n", i);
}
dbg_info("Marked pages as dirty\n");
}
// Put all pages that belong to this buffer
// and allow the machine to evict them from memory
// if it desires to do so.
for (i = 0; i < tmp_buffer->n_hpages; i++)
{
// dbg_info("Putting page %d...\n", i);
if (tmp_buffer->hpages[i])
put_page(tmp_buffer->hpages[i]);
else
dbg_info("entry for page %d is NULL!\n", i);
}
// release card pages
if (pd->en_mem)
{
card_free(d, tmp_buffer->cpages, tmp_buffer->n_pages,
tmp_buffer->huge ? LARGE_CHUNK_ALLOC : SMALL_CHUNK_ALLOC);
}
//
// Unmap from the TLB
//
vaddr_tmp = vaddr;
// alloc map array
map_array = (uint64_t *)kzalloc(tmp_buffer->n_pages * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
dbg_info("map buffers could not be allocated\n");
return -ENOMEM;
}
tlb_order = tmp_buffer->huge ? pd->ltlb_order : pd->stlb_order;
for (i = 0; i < tmp_buffer->n_pages; i++)
{
// This code works for huge and small pages alike, so there is
// no need to separate the two cases here.
// Create unmap entries in the map array.
tlb_create_unmap(tlb_order, vaddr_tmp, cpid, &map_array[2 * i]);
vaddr_tmp += tlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// Fire to actually remove the mappings from the tlb.
tlb_service_dev(d, tlb_order, map_array, tmp_buffer->n_pages);
#endif
kfree(map_array);
dbg_info("Successfully put user pages at gva %llx consisting of %llu pages for cpid %d\n",
tmp_buffer->vaddr, tmp_buffer->n_hpages, tmp_buffer->cpid);
return 0;
}
/**
 * @brief Put all card and host pages described by the notifier and thereby
 * allow them to be evicted again. The notifier contains the vaddr of the
 * region that should be put. This function searches the hash table
 * for a corresponding mapping, uses the unmap_entry function
 * to unmap the corresponding buffer, and afterwards removes
 * the mapping from the hash table.
 *
 * @param md mediated device
 * @param notifier notifier passed from the vm, copied into kernel space
 * @return int 0 if successful
 */
int hypervisor_tlb_put_user_pages(struct m_fpga_dev *md, struct hypervisor_map_notifier *notifier)
{
struct user_pages *tmp_buff;
struct hlist_node *tmp;
struct fpga_dev *d;
struct bus_drvdata *pd;
uint64_t vaddr;
uint64_t dirtied;
int32_t cpid;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!notifier);
vaddr = notifier->gva;
dirtied = notifier->dirtied;
cpid = notifier->cpid;
// Find all user mappings that hash to the same bucket as vaddr.
// Use the _safe iterator since matching entries are deleted and
// freed inside the loop.
hash_for_each_possible_safe(md->sbuff_map, tmp_buff, tmp, entry, vaddr)
{
// Check if it is the correct entry
if (tmp_buff->vaddr == vaddr && tmp_buff->cpid == cpid)
{
// unmap from TLB
unmap_entry(md, tmp_buff, dirtied);
// delete from hashtable
hash_del(&tmp_buff->entry);
// free memory
kfree(tmp_buff->hpages);
kfree(tmp_buff);
}
}
return 0;
}
/**
 * @brief Similar to hypervisor_tlb_put_user_pages, but puts all pages
 * held by the device md.
 *
 * @param md mediated device
 * @param dirtied indicates if all pages should be marked dirty before putting
 * @return int 0 if successful
 */
int hypervisor_tlb_put_user_pages_all(struct m_fpga_dev *md, int dirtied)
{
struct fpga_dev *d;
struct bus_drvdata *pd;
int bkt;
struct user_pages *tmp_buff;
struct hlist_node *tmp;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!pd);
// Iterate over all mappings and unmap each one. Use the _safe
// iterator since entries are deleted and freed inside the loop.
hash_for_each_safe(md->sbuff_map, bkt, tmp, tmp_buff, entry)
{
// unmap from TLB
unmap_entry(md, tmp_buff, dirtied);
// delete from hash table
hash_del(&tmp_buff->entry);
// free memory
kfree(tmp_buff->hpages);
kfree(tmp_buff);
}
return 0;
}