Coyote/driver/hypervisor/hypervisor_mmu.c

/**
* Copyright (c) 2023, Systems Group, ETH Zurich
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "hypervisor_mmu.h"
/**
 * @brief Pin user pages allocated in a vm.
 * This is a modified version that works on the notifier that is passed
 * down from the vm.
 *
 * The function reads the notifier and pins the pages of the vm.
 * The notifier contains guest physical addresses (gpa), so the function
 * first translates every gpa to a host virtual address (hva) in the vmm
 * process and then uses the hvas to pin the corresponding physical pages.
 *
 * After this step, the function installs mappings in the fpga TLB for the
 * guest virtual address given in the notifier. Whether the huge-page TLB or
 * the small-page TLB is used is determined by the notifier. This information
 * has to be passed down explicitly because, if the vm itself is backed by
 * huge pages, every guest page looks like a huge page to the hypervisor, so
 * the user's intent inside the vm must be forwarded through the notifier.
 *
 * In a last step, the mappings are actually written out to the fpga.
 *
 * @param d mediated device
 * @param notifier notifier from the vm, already copied into the kernel
 * @return int 0 if successful
 */
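/*
 * Summary of the notifier fields as they are consumed below (the struct
 * itself is declared in the hypervisor headers): gva is the guest virtual
 * address of the buffer, gpas[] holds the guest physical address of every
 * page, npages is the number of entries in gpas[], len is the length of the
 * region in bytes, is_huge selects the huge-page TLB, and cpid identifies
 * the Coyote process the mapping belongs to.
 */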
int hypervisor_tlb_get_user_pages(struct m_fpga_dev *d, struct hypervisor_map_notifier *notifier)
{
int ret_val, i, j;
struct fpga_dev *fd;
struct bus_drvdata *pd;
struct mm_struct *curr_mm;
struct task_struct *curr_task;
struct kvm *kvm;
pid_t pid;
uint64_t first, last;
uint64_t curr_vaddr, last_vaddr, vaddr_tmp, gva;
int n_pages, n_pages_huge;
int hugepages;
struct user_pages *user_pg;
uint64_t *hpages_phys = NULL, *map_array = NULL;
uint64_t count;
uint64_t *kvm_hvas;
ret_val = 0;
BUG_ON(!d);
fd = d->fpga;
BUG_ON(!fd);
pd = fd->pd;
BUG_ON(!pd);
kvm = d->kvm;
BUG_ON(!kvm);
BUG_ON(!notifier);
BUG_ON(notifier->npages == 0);
// number of pages
n_pages = notifier->npages;
gva = notifier->gva;
dbg_info("Going to pin %d pages for gva %llx\n", n_pages, gva);
// Get mmu context from kvm process
curr_mm = kvm->mm;
pid = kvm->userspace_pid;
curr_task = pid_task(find_vpid(pid), PIDTYPE_PID);
// translate the guest physical addresses into host virtual addresses in the kvm userspace process
kvm_hvas = kcalloc(notifier->npages, sizeof(uint64_t), GFP_KERNEL);
if (!kvm_hvas)
{
ret_val = -ENOMEM;
goto err_hvas;
}
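// gfn_to_hva() resolves each guest frame through the kvm memslots to the
// user-space virtual address in the vmm process that backs it; these hvas
// are what actually gets pinned below.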
for (i = 0; i < n_pages; i++)
{
kvm_hvas[i] = gfn_to_hva(kvm, gpa_to_gfn(notifier->gpas[i]));
}
count = notifier->len;
// hugepages support passed from vm
hugepages = (int) notifier->is_huge;
if (hugepages)
{
if (n_pages > MAX_N_MAP_HUGE_PAGES)
n_pages = MAX_N_MAP_HUGE_PAGES;
}
else
{
if (n_pages > MAX_N_MAP_PAGES)
n_pages = MAX_N_MAP_PAGES;
}
// overflow and empty-range checks; bail out through the error path
// so that the hva array does not leak
if (gva + count < gva)
{
ret_val = -EINVAL;
goto err_user_pg;
}
if (count == 0)
{
goto err_user_pg;
}
// allocate management structs
user_pg = kzalloc(sizeof(struct user_pages), GFP_KERNEL);
if (!user_pg)
{
ret_val = -ENOMEM;
goto err_user_pg;
}
user_pg->hpages = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
if (!user_pg->hpages)
{
ret_val = -ENOMEM;
goto err_hpages;
}
// Pin all pages obtained from the vm
for (i = 0; i < n_pages; i++)
{
// pin pages of the kvm
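// Since kernel 5.9, get_user_pages_remote() no longer takes a task_struct.
// In both variants the flag value 1 requests write access (FOLL_WRITE) and
// exactly one page is pinned per loop iteration.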
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
ret_val = get_user_pages_remote(curr_mm, (unsigned long)kvm_hvas[i], 1, 1, user_pg->hpages + i, NULL, NULL);
#else
ret_val = get_user_pages_remote(curr_task, curr_mm, (unsigned long)kvm_hvas[i], 1, 1, user_pg->hpages + i, NULL);
#endif
if (ret_val != 1 || !user_pg->hpages[i])
{
pr_info("%s.%d: Failed to pin all pages, failed to map %d with ret_val %d\n", __func__, __LINE__, i, ret_val);
goto err_pin_pages;
}
// dbg_info("pinned page hpa: %llx\n", page_to_phys(user_pg->hpages[i]));
}
dbg_info("Pinned pages\n");
// Reset ret_val
ret_val = 0;
// flush cache
for (i = 0; i < n_pages; i++)
{
flush_dcache_page(user_pg->hpages[i]);
}
// populate map entry
user_pg->vaddr = gva;
user_pg->n_hpages = n_pages;
user_pg->huge = hugepages;
dbg_info("mapping vaddr %llx, cpid %llu, hugepages %llu\n", gva, notifier->cpid, notifier->is_huge);
vaddr_tmp = gva;
if (hugepages) // For hugepages
{
// Shift page numbers to work on huge pages
first = (gva & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift;
last = ((gva + count - 1) & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift;
n_pages_huge = last - first + 1;
user_pg->n_pages = n_pages_huge;
// allocate pages array
hpages_phys = kzalloc(n_pages_huge * sizeof(uint64_t), GFP_KERNEL);
if (!hpages_phys)
{
ret_val = -ENOMEM;
goto err_phys_pages;
}
j = 0;
curr_vaddr = gva;
last_vaddr = -1;
// Get the hpa for the huge pages
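// The pages above were pinned at 4KiB granularity, so consecutive entries
// can fall on the same huge page; walk the region in PAGE_SIZE steps and
// record one host physical address per distinct huge-page frame.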
for (i = 0; i < n_pages; i++)
{
// Only store an entry if we encounter a new huge page
if (((curr_vaddr & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift) !=
((last_vaddr & pd->ltlb_order->page_mask) >> pd->ltlb_order->page_shift))
{
hpages_phys[j] = page_to_phys(user_pg->hpages[i]) & pd->ltlb_order->page_mask;
last_vaddr = curr_vaddr;
j++;
}
curr_vaddr += PAGE_SIZE;
}
// If we have memory attached on the card we want to allocate
// the same amount of memory on the card
if (pd->en_mem)
{
// Allocate memory
user_pg->cpages = kzalloc(n_pages_huge * sizeof(uint64_t), GFP_KERNEL);
if (!user_pg->cpages)
{
pr_info("Failed to allocate card buffer");
ret_val = -ENOMEM;
goto err_cpages;
}
// Alloc 2MB chunks from the card
ret_val = card_alloc(fd, user_pg->cpages, n_pages_huge, LARGE_CHUNK_ALLOC);
if (ret_val)
{
pr_info("Failed to allocate card memory");
ret_val = -ENOMEM;
goto err_card_mem;
}
dbg_info("card allocated %d hugepages in hypervisor\n", n_pages_huge);
}
// alloc map array
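// Every TLB mapping occupies two 64-bit words, which is why the array holds
// 2 * n_pages_huge entries and tlb_create_map() fills &map_array[2 * i].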
map_array = kzalloc(n_pages_huge * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
pr_info("Failed to allocate map buffers\n");
ret_val = -ENOMEM;
goto err_map_buffer;
}
vaddr_tmp = gva;
// populate map with mappings from guest virtual address
// to host physical addresses
for (i = 0; i < n_pages_huge; i++)
{
tlb_create_map(pd->ltlb_order,
vaddr_tmp,
hpages_phys[i],
(pd->en_mem ? user_pg->cpages[i] : 0),
notifier->cpid,
&map_array[2 * i]);
vaddr_tmp += pd->ltlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// Fire the mappings to the fpga
tlb_service_dev(fd, pd->ltlb_order, map_array, n_pages_huge);
#endif
kfree(map_array);
kfree(hpages_phys);
}
else // For small pages
{
user_pg->n_pages = n_pages;
// if memory is attached to the card we want to allocate
// the same amount on the card
if (pd->en_mem)
{
// Allocate pages management array
user_pg->cpages = kzalloc(n_pages * sizeof(uint64_t), GFP_KERNEL);
if (!user_pg->cpages)
{
dbg_info("could not allocate card buffer\n");
ret_val = -ENOMEM;
goto err_cpages;
}
// Allocate 4KB chunks of card memory
ret_val = card_alloc(fd, user_pg->cpages, n_pages, SMALL_CHUNK_ALLOC);
if (ret_val)
{
dbg_info("could not get all card pages, %d\n", ret_val);
goto err_card_mem;
}
}
// allocate map array
map_array = (uint64_t *)kzalloc(n_pages * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
dbg_info("map buffers could not be allocated\n");
ret_val = -ENOMEM;
if (pd->en_mem)
card_free(fd, user_pg->cpages, n_pages, SMALL_CHUNK_ALLOC);
goto err_card_mem;
}
// populate mappings array with guest virtual address
// to host physical mapping.
for (i = 0; i < n_pages; i++)
{
tlb_create_map(pd->stlb_order,
vaddr_tmp,
page_to_phys(user_pg->hpages[i]),
(pd->en_mem ? user_pg->cpages[i] : 0),
notifier->cpid, &map_array[2 * i]);
vaddr_tmp += pd->stlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// fire interrupt to install the mappings on the fpga
tlb_service_dev(fd, pd->stlb_order, map_array, n_pages);
#endif
// free buffers
kfree(map_array);
}
// Add the entry to the sbuff hash map. This is used for
// management of the allocated memory and allows it to be
// deallocated later on.
hash_add(d->sbuff_map, &user_pg->entry, notifier->gva);
// the temporary hva array is not needed anymore once the pages are pinned and mapped
kfree(kvm_hvas);
return ret_val;
err_pin_pages:
for (j = 0; j < i; j++)
{
put_page(user_pg->hpages[j]);
}
kfree(user_pg->hpages);
kfree(user_pg);
kfree(kvm_hvas);
return -ENOMEM;
err_map_buffer:
if (pd->en_mem)
card_free(fd, user_pg->cpages, n_pages_huge, LARGE_CHUNK_ALLOC);
err_card_mem:
kfree(user_pg->cpages);
err_cpages:
kfree(hpages_phys);
err_phys_pages:
for (i = 0; i < user_pg->n_hpages; i++)
{
put_page(user_pg->hpages[i]);
}
err_hpages:
kfree(user_pg);
err_user_pg:
kfree(kvm_hvas);
err_hvas:
return ret_val;
}
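/*
 * Illustrative call sequence (a sketch only; the actual entry point lives in
 * the hypervisor frontend that receives the notifier from the vm, not in
 * this file):
 *
 *   // notifier already copied from the guest into kernel memory
 *   ret = hypervisor_tlb_get_user_pages(md, notifier);   // pin + map
 *   ...                                                   // fpga uses the buffer
 *   ret = hypervisor_tlb_put_user_pages(md, notifier);    // unmap + unpin
 */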
/**
 * @brief Unmap an entry described by a user_pages struct. This is the
 * hypervisor version of put user pages. It is a small refactor of the
 * original version: this function contains the code that actually unmaps
 * a buffer and is in turn used by both the put and the put-all functions.
 *
 * @param md mediated device
 * @param tmp_buffer user pages struct that describes the mapped region
 * @param dirtied indicates if all pages should be marked dirty before putting
 * @return int 0 if successful
 */
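/*
 * Note: unmap_entry() only unpins the host pages, releases the card memory
 * and clears the fpga TLB entries; removing the user_pages struct from the
 * hash table and freeing it is left to the callers below.
 */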
static int unmap_entry(struct m_fpga_dev *md, struct user_pages *tmp_buffer, int dirtied)
{
int i;
struct fpga_dev *d;
struct bus_drvdata *pd;
uint64_t vaddr_tmp, vaddr;
uint64_t *map_array;
int32_t cpid;
struct tlb_order *tlb_order;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!pd);
BUG_ON(!tmp_buffer);
BUG_ON(!(tmp_buffer->hpages));
dbg_info("Putting user_pages entry\n");
vaddr = tmp_buffer->vaddr;
cpid = tmp_buffer->cpid;
// If the pages should be dirtied,
// go through all pages and mark them as dirty.
if (dirtied)
{
for (i = 0; i < tmp_buffer->n_hpages; i++)
{
if (tmp_buffer->hpages[i])
SetPageDirty(tmp_buffer->hpages[i]);
else
dbg_info("entry for page %d is NULL!\n", i);
}
dbg_info("Marked pages as dirty\n");
}
// Put all pages that belong to this buffer
// and allow the machine to evict them from memory
// if it desires to do so.
for (i = 0; i < tmp_buffer->n_hpages; i++)
{
// dbg_info("Putting page %d...\n", i);
if (tmp_buffer->hpages[i])
put_page(tmp_buffer->hpages[i]);
else
dbg_info("entry for page %d is NULL!\n", i);
}
// release card pages
if (pd->en_mem)
{
card_free(d, tmp_buffer->cpages, tmp_buffer->n_pages,
tmp_buffer->huge ? LARGE_CHUNK_ALLOC : SMALL_CHUNK_ALLOC);
}
//
// Unmap from the TLB
//
vaddr_tmp = vaddr;
// alloc map array
map_array = (uint64_t *)kzalloc(tmp_buffer->n_pages * 2 * sizeof(uint64_t), GFP_KERNEL);
if (!map_array)
{
dbg_info("map buffers could not be allocated\n");
return -ENOMEM;
}
tlb_order = tmp_buffer->huge ? pd->ltlb_order : pd->stlb_order;
for (i = 0; i < tmp_buffer->n_pages; i++)
{
// This code works for huge and small pages alike, so there is
// no need to separate the two cases here.
// Create unmap entries in the map array.
tlb_create_unmap(tlb_order, vaddr_tmp, cpid, &map_array[2 * i]);
vaddr_tmp += tlb_order->page_size;
}
#ifndef HYPERVISOR_TEST
// Fire to actually remove the mappings from the tlb.
tlb_service_dev(d, tlb_order, map_array, tmp_buffer->n_pages);
#endif
kfree(map_array);
dbg_info("Successfully put user pages at gva %llx consisting of %llu pages for cpid %d\n",
tmp_buffer->vaddr, tmp_buffer->n_hpages, tmp_buffer->cpid);
return 0;
}
/**
 * @brief Put all card and host pages described by the notifier and thereby
 * allow them to be evicted again. The notifier contains the vaddr of the
 * region that should be put. This function searches the hash table
 * for a corresponding mapping, uses the unmap_entry function
 * to unmap the corresponding buffer, and afterwards removes
 * the mapping from the hash table.
 *
 * @param md mediated device
 * @param notifier notifier passed from the vm, copied into kernel space
 * @return int 0 if successful
 */
int hypervisor_tlb_put_user_pages(struct m_fpga_dev *md, struct hypervisor_map_notifier *notifier)
{
struct user_pages *tmp_buff;
struct hlist_node *tmp;
struct fpga_dev *d;
struct bus_drvdata *pd;
uint64_t vaddr;
uint64_t dirtied;
int32_t cpid;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!notifier);
vaddr = notifier->gva;
dirtied = notifier->dirtied;
cpid = notifier->cpid;
// Find all user mappings that hash to the same bucket as vaddr.
// Use the _safe iterator since matching entries are deleted and
// freed inside the loop.
hash_for_each_possible_safe(md->sbuff_map, tmp_buff, tmp, entry, vaddr)
{
// Check if it is the correct entry
if (tmp_buff->vaddr == vaddr && tmp_buff->cpid == cpid)
{
// unmap from TLB
unmap_entry(md, tmp_buff, dirtied);
// delete from hashtable
hash_del(&tmp_buff->entry);
// free memory
kfree(tmp_buff->hpages);
kfree(tmp_buff);
}
}
return 0;
}
/**
 * @brief Similar to hypervisor_tlb_put_user_pages, but puts all pages
 * held by the device md.
 *
 * @param md mediated device
 * @param dirtied indicates if all pages should be marked dirty before putting
 * @return int 0 if successful
 */
int hypervisor_tlb_put_user_pages_all(struct m_fpga_dev *md, int dirtied)
{
struct fpga_dev *d;
struct bus_drvdata *pd;
int bkt;
struct user_pages *tmp_buff;
struct hlist_node *tmp;
BUG_ON(!md);
d = md->fpga;
BUG_ON(!d);
pd = d->pd;
BUG_ON(!pd);
// Iterate over all mappings and unmap each one. Use the _safe
// iterator since entries are deleted and freed inside the loop.
hash_for_each_safe(md->sbuff_map, bkt, tmp, tmp_buff, entry)
{
// unmap from TLB
unmap_entry(md, tmp_buff, dirtied);
// delete from hash table
hash_del(&tmp_buff->entry);
// free memory
kfree(tmp_buff->hpages);
kfree(tmp_buff);
}
return 0;
}