hrev54102 adds 2 changesets to branch 'master'
old head: c1c239fefb6198f014b0d62c9de459fee3f509dc
new head: 18e9c8885f3c45110425f8d7508f5f7c71ee0a36
overview:
https://git.haiku-os.org/haiku/log/?qt=range&q=18e9c8885f3c+%5Ec1c239fefb61
----------------------------------------------------------------------------
1b3ccbc50b6d: nvme_disk: Overhaul to use physical-buffer-based I/O.
This fixes the virtual/physical mixup problems that plagued
the driver until now.
This is rather inefficient, though, as it does its own page-based
bouncing instead of using DMAResource. That will come in the
next commit.
18e9c8885f3c: nvme_disk: Use DMAResource for bouncing.
This is a "best of both worlds" approach: if nvme_disk
determines the I/O can be done with no bouncing at all,
it will do so; otherwise, the I/O is done by the _bounce
method that uses DMAResource.
Thanks to mmlr for helping investigate some of the DMAResource
crashes and other oddities.
Fixes #15818 and #15123.
[ Augustin Cavalier <waddlesplash@xxxxxxxxx> ]
----------------------------------------------------------------------------
2 files changed, 350 insertions(+), 122 deletions(-)
.../drivers/disk/nvme/compat/libnvme_haiku.cpp | 2 +-
.../kernel/drivers/disk/nvme/nvme_disk.cpp | 470 ++++++++++++++-----
############################################################################
Commit: 1b3ccbc50b6d00ea8c3d779cb1fd33522ccb95b6
URL: https://git.haiku-os.org/haiku/commit/?id=1b3ccbc50b6d
Author: Augustin Cavalier <waddlesplash@xxxxxxxxx>
Date: Tue Apr 28 02:49:51 2020 UTC
nvme_disk: Overhaul to use physical-buffer-based I/O.
This fixes the virtual/physical mixup problems that plagued
the driver until now.
This is rather inefficient, though, as it does its own page-based
bouncing instead of using DMAResource. That will come in the
next commit.
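
For context on the mechanism: rather than handing libnvme a virtual buffer,
the reworked driver describes each transfer as an array of physical_entry
vectors, which libnvme pulls through a pair of scatter/gather callbacks. A
condensed sketch of the pattern as it appears in the diff below (tracing,
error handling and the write path trimmed; the cursor bounds check is
tightened to ">=" in this sketch):

    struct nvme_io_request {
        status_t status;
        bool write;
        off_t lba_start;
        size_t lba_count;
        physical_entry* iovecs;    // physical scatter/gather vectors
        int32 iovec_count;
        int32 iovec_i;             // cursor used by the SGL callbacks
    };

    // libnvme rewinds the cursor before building (or rebuilding) a command...
    void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
    {
        request->iovec_i = offset;
    }

    // ...then pulls one physical segment per call until we return -1.
    int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
    {
        if (request->iovec_i < 0 || request->iovec_i >= request->iovec_count)
            return -1;
        *address = request->iovecs[request->iovec_i].address;
        *length = request->iovecs[request->iovec_i].size;
        request->iovec_i++;
        return 0;
    }

    // Queueing a read then reduces to:
    int ret = nvme_ns_readv(info->ns, qpair, request->lba_start,
        request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 0,
        (nvme_req_reset_sgl_cb)ior_reset_sgl,
        (nvme_req_next_sge_cb)ior_next_sge);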
----------------------------------------------------------------------------
diff --git a/src/add-ons/kernel/drivers/disk/nvme/compat/libnvme_haiku.cpp b/src/add-ons/kernel/drivers/disk/nvme/compat/libnvme_haiku.cpp
index 29d25e221f..af959e62fc 100644
--- a/src/add-ons/kernel/drivers/disk/nvme/compat/libnvme_haiku.cpp
+++ b/src/add-ons/kernel/drivers/disk/nvme/compat/libnvme_haiku.cpp
@@ -81,7 +81,7 @@ phys_addr_t
nvme_mem_vtophys(void* vaddr)
{
physical_entry entry;
- status_t status = get_memory_map((void*)vaddr, 0, &entry, 1);
+ status_t status = get_memory_map((void*)vaddr, 1, &entry, 1);
if (status != B_OK) {
panic("nvme: get_memory_map failed for %p: %s\n",
(void*)vaddr, strerror(status));
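
The one-line fix above is subtle: get_memory_map() translates a virtual range
of the given byte length, so a length of 0 gives it nothing to translate and
the returned entry was never valid; one byte is the smallest meaningful query,
and a single byte cannot span a page boundary, so one entry suffices. A hedged
sketch of the corrected pattern (vtophys_checked is a hypothetical name; the
driver's real wrapper panics on failure instead of returning 0):

    static phys_addr_t
    vtophys_checked(void* vaddr)
    {
        physical_entry entry;
        // Translate a one-byte range starting at vaddr.
        if (get_memory_map(vaddr, 1, &entry, 1) != B_OK)
            return 0;
        return entry.address;
    }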
diff --git a/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp b/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
index b7b67e4712..324d2d9521 100644
--- a/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
+++ b/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
@@ -19,6 +19,9 @@
#include <fs/devfs.h>
#include <bus/PCI.h>
#include <PCI_x86.h>
+#include <vm/vm.h>
+
+#include "IORequest.h"
extern "C" {
#include <libnvme/nvme.h>
@@ -202,6 +205,7 @@ nvme_disk_init_device(void* _info, void** _cookie)
int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
if (err != 0) {
TRACE_ERROR("failed to get controller information!\n");
+ nvme_ctrlr_close(info->ctrlr);
return err;
}
@@ -316,6 +320,7 @@ nvme_disk_uninit_device(void* _cookie)
nvme_ctrlr_close(info->ctrlr);
// TODO: Deallocate MSI(-X).
+ // TODO: Deallocate PCI.
}
@@ -358,7 +363,7 @@ nvme_disk_free(void* cookie)
}
-// #pragma mark - I/O functions
+// #pragma mark - I/O
static int32
@@ -371,7 +376,7 @@ nvme_interrupt_handler(void* _info)
static qpair_info*
-get_next_qpair(nvme_disk_driver_info* info)
+get_qpair(nvme_disk_driver_info* info)
{
return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
% info->qpair_count];
@@ -379,7 +384,7 @@ get_next_qpair(nvme_disk_driver_info* info)
static void
-disk_io_callback(status_t* status, const struct nvme_cpl* cpl)
+io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}
@@ -388,6 +393,8 @@ disk_io_callback(status_t* status, const struct nvme_cpl* cpl)
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
+ CALLED();
+
ConditionVariableEntry entry;
int timeouts = 0;
while (status == EINPROGRESS) {
@@ -416,163 +423,427 @@ await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& st
}
-static status_t
-do_nvme_io(nvme_disk_driver_info* info, off_t rounded_pos, void* buffer,
- size_t* rounded_len, bool write = false)
+struct nvme_io_request {
+ status_t status;
+
+ bool write;
+
+ off_t lba_start;
+ size_t lba_count;
+
+ physical_entry* iovecs;
+ int32 iovec_count;
+
+ int32 iovec_i;
+};
+
+
+void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
- CALLED();
- const size_t block_size = info->block_size;
+ request->iovec_i = offset;
+}
- status_t status = EINPROGRESS;
- qpair_info* qpinfo = get_next_qpair(info);
+int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
+{
+ int32 index = request->iovec_i;
+ if (index < 0 || index > request->iovec_count)
+ return -1;
+
+ *address = request->iovecs[index].address;
+ *length = request->iovecs[index].size;
+
+	TRACE("IOV %d: 0x%" B_PRIxPHYSADDR ", %d\n", request->iovec_i, *address, (int)*length);
+
+ request->iovec_i++;
+ return 0;
+}
+
+
+static status_t
+do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
+{
+ request->status = EINPROGRESS;
+
+ qpair_info* qpinfo = get_qpair(info);
int ret = -1;
- if (write) {
- ret = nvme_ns_write(info->ns, qpinfo->qpair, buffer,
- rounded_pos / block_size, *rounded_len / block_size,
- (nvme_cmd_cb)disk_io_callback, &status, 0);
+ if (request->write) {
+		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
+			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
+ 0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
+ (nvme_req_next_sge_cb)ior_next_sge);
} else {
- ret = nvme_ns_read(info->ns, qpinfo->qpair, buffer,
- rounded_pos / block_size, *rounded_len / block_size,
- (nvme_cmd_cb)disk_io_callback, &status, 0);
+ ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
+			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
+ 0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
+ (nvme_req_next_sge_cb)ior_next_sge);
}
if (ret != 0) {
-		TRACE_ERROR("attempt to queue %s I/O at %" B_PRIdOFF " of %" B_PRIuSIZE
-			" bytes failed!\n", write ? "write" : "read", rounded_pos, *rounded_len);
+		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
+ " blocks failed!\n", request->write ? "write" : "read",
+ request->lba_start, request->lba_count);
+
+ request->lba_count = 0;
return ret;
}
- await_status(info, qpinfo->qpair, status);
+ await_status(info, qpinfo->qpair, request->status);
- if (status != B_OK) {
-		TRACE_ERROR("%s at %" B_PRIdOFF " of %" B_PRIuSIZE " bytes failed!\n",
- write ? "write" : "read", rounded_pos, *rounded_len);
- *rounded_len = 0;
+ if (request->status != B_OK) {
+ TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
+ " blocks failed!\n", request->write ? "write" : "read",
+ request->lba_start, request->lba_count);
+
+ request->lba_count = 0;
+ }
+ return request->status;
+}
+
+
+static status_t bounce_copy(bool write, int8* buffer, phys_size_t& buffer_offset,
+ phys_size_t& buffer_remaining, int32& vec_i, phys_size_t& vec_offset,
+ physical_entry* iovecs, int32 vec_count)
+{
+ status_t status = B_OK;
+ while (buffer_remaining > 0 && vec_i < vec_count) {
+ phys_size_t len = min_c(iovecs[vec_i].size - vec_offset,
+ buffer_remaining);
+ if (write) {
+ status = vm_memcpy_from_physical(buffer + buffer_offset,
+ iovecs[vec_i].address + vec_offset, len, false);
+ } else {
+			status = vm_memcpy_to_physical(iovecs[vec_i].address + vec_offset,
+ buffer + buffer_offset, len, false);
+ }
+ if (status != B_OK)
+ break;
+
+ vec_offset += len;
+ if (vec_offset == iovecs[vec_i].size) {
+ vec_i++;
+ vec_offset = 0;
+ }
+ buffer_remaining -= len;
+ buffer_offset += len;
}
return status;
}
static status_t
-nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
+nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request,
+ off_t rounded_pos, phys_size_t rounded_len,
+ physical_entry* iovecs, int32 count)
{
- CALLED();
- nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
const size_t block_size = handle->info->block_size;
+ const phys_size_t buffer_size = B_PAGE_SIZE,
+ blocks_per_buffer = buffer_size / block_size;
+
+ phys_addr_t phys_buffer;
+ phys_size_t buffer_offset = (request->Offset() - rounded_pos);
+ int8* buffer = (int8*)nvme_mem_alloc_node(buffer_size, buffer_size, 0,
+ &phys_buffer);
+ if (buffer == NULL)
+ return B_NO_MEMORY;
- // libnvme does transfers in units of device sectors, so if we have to
-	// round either the position or the length, we will need a bounce buffer.
- const off_t rounded_pos = ROUNDDOWN(pos, block_size);
-	size_t rounded_len = ROUNDUP((*length) + (pos - rounded_pos), block_size);
- if (rounded_pos != pos || rounded_len != *length
- || IS_USER_ADDRESS(buffer)) {
- void* bounceBuffer = malloc(rounded_len);
- MemoryDeleter _(bounceBuffer);
- if (bounceBuffer == NULL) {
- *length = 0;
- return B_NO_MEMORY;
+ status_t status = B_OK;
+ if (request->IsWrite() && request->Offset() != rounded_pos) {
+ // Read in the first block.
+ nvme_io_request first;
+ memset(&first, 0, sizeof(nvme_io_request));
+
+ first.lba_start = rounded_pos / block_size;
+ first.lba_count = 1;
+
+ physical_entry entry;
+ entry.address = phys_buffer;
+ entry.size = block_size;
+ first.iovecs = &entry;
+ first.iovec_count = 1;
+
+ status = do_nvme_io_request(handle->info, &first);
+ }
+
+ if (status != B_OK) {
+ nvme_free(buffer);
+ return status;
+ }
+
+ nvme_io_request nvme_request;
+ memset(&nvme_request, 0, sizeof(nvme_io_request));
+ physical_entry request_entry;
+ request_entry.address = phys_buffer;
+ nvme_request.iovecs = &request_entry;
+ nvme_request.iovec_count = 1;
+
+ off_t lba_offset = rounded_pos / block_size;
+ phys_size_t lba_remaining = rounded_len / block_size, iovec_offset = 0;
+ int32 iovec = 0;
+ while (lba_remaining > 0 && status == B_OK) {
+ phys_size_t buffer_remaining = buffer_size - buffer_offset;
+ if (request->IsWrite() && lba_remaining <= blocks_per_buffer
+			&& ((rounded_len - request->Length()) - (request->Offset() - rounded_pos)) > 0) {
+ // Read in the last block.
+ nvme_io_request last;
+ memset(&last, 0, sizeof(nvme_io_request));
+
+ last.lba_start = lba_offset + lba_remaining - 1;
+ last.lba_count = 1;
+
+ physical_entry entry;
+			entry.address = phys_buffer + ((lba_remaining - 1) * block_size);
+ entry.size = block_size;
+ last.iovecs = &entry;
+ last.iovec_count = 1;
+
+ status = do_nvme_io_request(handle->info, &last);
+ if (status != B_OK)
+ break;
}
- status_t status = do_nvme_io(handle->info, rounded_pos,
- bounceBuffer, &rounded_len);
- if (status != B_OK) {
-			// The "rounded_len" will be the actual transferred length, but
-			// of course it will contain the padding.
-			*length = std::min(*length, (size_t)std::max((off_t)0,
-				(off_t)rounded_len - (off_t)(pos - rounded_pos)));
- if (*length == 0)
- return status;
+ if (request->IsWrite()) {
+			status = bounce_copy(true, buffer, buffer_offset, buffer_remaining,
+ iovec, iovec_offset, iovecs, count);
+ if (status != B_OK)
+ break;
+			request_entry.size = ROUNDUP(buffer_size - buffer_remaining, block_size);
+ } else {
+			request_entry.size = min_c(lba_remaining * block_size, buffer_size);
}
-		void* offsetBuffer = (void*)((addr_t)bounceBuffer + (pos - rounded_pos));
- if (IS_USER_ADDRESS(buffer))
- status = user_memcpy(buffer, offsetBuffer, *length);
- else
- memcpy(buffer, offsetBuffer, *length);
- return status;
+ nvme_request.lba_start = lba_offset;
+ nvme_request.lba_count = request_entry.size / block_size;
+
+ status = do_nvme_io_request(handle->info, &nvme_request);
+ if (status != B_OK)
+ break;
+
+ lba_offset += nvme_request.lba_count;
+ lba_remaining -= nvme_request.lba_count;
+
+ if (request->IsRead()) {
+			status = bounce_copy(false, buffer, buffer_offset, buffer_remaining,
+ iovec, iovec_offset, iovecs, count);
+ }
+
+ buffer_offset = 0;
}
- // If we got here, that means the arguments are already rounded to LBAs
- // and the buffer is a kernel one, so just do the I/O directly.
- return do_nvme_io(handle->info, pos, buffer, length);
+ // We're done with the bounce buffer.
+ nvme_free(buffer);
+
+ // Determine how much was actually transferred relative to the request.
+ const off_t endOffset = request->Offset() + request->Length(),
+ trueEndOffset = lba_offset * block_size;
+ generic_size_t transferred;
+ if (trueEndOffset >= endOffset)
+ transferred = request->Length();
+ else
+ transferred = trueEndOffset - request->Offset();
+
+	request->SetTransferredBytes(transferred < request->Length(), transferred);
+ return status;
}
static status_t
-nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
+nvme_disk_io(void* cookie, io_request* request)
{
CALLED();
+
nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
- const size_t block_size = handle->info->block_size;
- const off_t rounded_pos = ROUNDDOWN(pos, block_size);
-	size_t rounded_len = ROUNDUP((*length) + (pos - rounded_pos), block_size);
- if (rounded_pos != pos || rounded_len != *length
- || IS_USER_ADDRESS(buffer)) {
- void* bounceBuffer = malloc(rounded_len);
- MemoryDeleter _(bounceBuffer);
- if (bounceBuffer == NULL) {
- *length = 0;
+ nvme_io_request nvme_request;
+ memset(&nvme_request, 0, sizeof(nvme_io_request));
+
+ nvme_request.write = request->IsWrite();
+
+ physical_entry* vtophys = NULL;
+ MemoryDeleter vtophysDeleter;
+
+ IOBuffer* buffer = request->Buffer();
+ status_t status = B_OK;
+ if (!buffer->IsPhysical()) {
+		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
+ if (status != B_OK) {
+			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
+ return status;
+ }
+		// SetStatusAndNotify() takes care of unlocking memory if necessary.
+
+		// This is slightly inefficient, as we could use a BStackOrHeapArray in
+		// the optimal case (few physical entries required), but we would not
+		// know whether or not that was possible until calling get_memory_map()
+		// and then potentially reallocating, which would complicate the logic.
+
+ int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
+		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
+ * vtophys_length);
+ if (vtophys == NULL) {
+ TRACE_ERROR("failed to allocate memory for iovecs\n");
+ request->SetStatusAndNotify(B_NO_MEMORY);
return B_NO_MEMORY;
}
-
-		// If we rounded, we must protect against simulaneous writes inside
- // the rounded blocks, as otherwise data corruption would occur.
- WriteLocker writeLocker;
- ReadLocker readLocker;
- if (rounded_pos != pos || rounded_len != *length)
-			writeLocker.SetTo(handle->info->rounded_write_lock, false);
- else
-			readLocker.SetTo(handle->info->rounded_write_lock, false);
-
-		// Since we rounded, we need to read in the first and last logical
- // blocks before we copy our information to the bounce buffer.
- // TODO: This would be faster if we queued both reads at once!
- size_t readlen = block_size;
- status_t status = B_OK;
- if (rounded_pos != pos) {
-			status = do_nvme_io(handle->info, rounded_pos, bounceBuffer,
- &readlen);
- if (status != B_OK) {
- *length = 0;
- return status;
+ vtophysDeleter.SetTo(vtophys);
+
+ for (size_t i = 0; i < buffer->VecCount(); i++) {
+ generic_io_vec virt = buffer->VecAt(i);
+			uint32 entries = vtophys_length - nvme_request.iovec_count;
+
+			// Avoid copies by going straight into the vtophys array.
+			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
+				virt.length, vtophys + nvme_request.iovec_count, &entries);
+ if (status == B_BUFFER_OVERFLOW) {
+				TRACE("vtophys array was too small, reallocating\n");
+
+ vtophysDeleter.Detach();
+ vtophys_length *= 2;
+				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
+					sizeof(physical_entry) * vtophys_length);
+ vtophysDeleter.SetTo(vtophys);
+ if (vtophys == NULL) {
+ status = B_NO_MEMORY;
+ } else {
+					// Try again, with the larger buffer this time.
+ i--;
+ continue;
+ }
}
- }
- if (rounded_len > block_size) {
- off_t offset = rounded_len - block_size;
- status = do_nvme_io(handle->info, rounded_pos + offset,
- ((int8*)bounceBuffer) + offset, &readlen);
if (status != B_OK) {
- *length = 0;
+				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
+ request->SetStatusAndNotify(status);
return status;
}
- }
- // Now we can copy in the actual data to be written.
-		void* offsetBuffer = (void*)((addr_t)bounceBuffer + (pos - rounded_pos));
- if (IS_USER_ADDRESS(buffer))
- status = user_memcpy(offsetBuffer, buffer, *length);
- else
- memcpy(offsetBuffer, buffer, *length);
- if (status != B_OK) {
- *length = 0;
- return status;
+ nvme_request.iovec_count += entries;
}
+ } else {
+ nvme_request.iovecs = (physical_entry*)buffer->Vecs();
+ nvme_request.iovec_count = buffer->VecCount();
+ }
- status = do_nvme_io(handle->info, rounded_pos, bounceBuffer,
- &rounded_len, true);
- if (status != B_OK) {
- *length = std::min(*length, (size_t)std::max((off_t)0,
-				(off_t)rounded_len - (off_t)(pos - rounded_pos)));
- }
+ // See if we need to bounce anything other than the first or last vec.
+ const size_t block_size = handle->info->block_size;
+ bool bounceAll = false;
+	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
+ if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
+ bounceAll = true;
+ if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
+ bounceAll = true;
+ }
+
+ // See if we need to bounce due to the first or last vec.
+ if (nvme_request.iovec_count > 1) {
+ physical_entry* entry = &nvme_request.iovecs[0];
+		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
+ || (entry->size % block_size) != 0))
+ bounceAll = true;
+
+ entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
+ if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
+ || (entry->size % block_size) != 0))
+ bounceAll = true;
+ }
+
+ // See if we need to bounce due to rounding.
+ const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
+ phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
+ - rounded_pos), block_size);
+	if (rounded_pos != request->Offset() || rounded_len != request->Length())
+ bounceAll = true;
+
+ if (bounceAll) {
+		// Let the bounced I/O routine take care of everything from here.
+		status = nvme_disk_bounced_io(handle, request, rounded_pos, rounded_len,
+ nvme_request.iovecs, nvme_request.iovec_count);
+ request->SetStatusAndNotify(status);
+ return status;
+ }
+
+ nvme_request.lba_start = rounded_pos / block_size;
+ nvme_request.lba_count = rounded_len / block_size;
+
+ // No bouncing was required.
+ ReadLocker readLocker;
+ if (nvme_request.write)
+ readLocker.SetTo(handle->info->rounded_write_lock, false);
+
+ // Error check before actually doing I/O.
+ if (status != B_OK) {
+ TRACE_ERROR("I/O failed early: %s\n", strerror(status));
+ request->SetStatusAndNotify(status);
return status;
}
- ReadLocker _(handle->info->rounded_write_lock);
+ int32 remaining = nvme_request.iovec_count;
+ while (remaining > 0 && status == B_OK) {
+ nvme_request.iovec_count = min_c(remaining,
+ NVME_MAX_SGL_DESCRIPTORS / 2);
- // If we got here, that means the arguments are already rounded to LBAs,
- // so just do the I/O directly.
- return do_nvme_io(handle->info, pos, (void*)buffer, length, true);
+ nvme_request.lba_count = 0;
+ for (int i = 0; i < nvme_request.iovec_count; i++)
+			nvme_request.lba_count += (nvme_request.iovecs[i].size / block_size);
+
+ status = do_nvme_io_request(handle->info, &nvme_request);
+
+ nvme_request.iovecs += nvme_request.iovec_count;
+ remaining -= nvme_request.iovec_count;
+ nvme_request.lba_start += nvme_request.lba_count;
+ }
+
+ if (status != B_OK)
+ TRACE_ERROR("I/O failed: %s\n", strerror(status));
+
+ request->SetTransferredBytes(status != B_OK,
+ (nvme_request.lba_start * block_size) - rounded_pos);
+ request->SetStatusAndNotify(status);
+ return status;
+}
+
+
+static status_t
+nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
+{
+ CALLED();
+ nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
+
+ const off_t end = (handle->info->capacity * handle->info->block_size);
+ if (pos >= end)
+ return B_BAD_VALUE;
+ if (pos + (off_t)*length > end)
+ *length = end - pos;
+
+ IORequest request;
+ status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
+ if (status != B_OK)
+ return status;
+
+ status = nvme_disk_io(handle, &request);
+ *length = request.TransferredBytes();
+ return status;
+}
+
+
+static status_t
+nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
+{
+ CALLED();
+ nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
+
+ const off_t end = (handle->info->capacity * handle->info->block_size);
+ if (pos >= end)
+ return B_BAD_VALUE;
+ if (pos + (off_t)*length > end)
+ *length = end - pos;
+
+ IORequest request;
+ status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
+ if (status != B_OK)
+ return status;
+
+ status = nvme_disk_io(handle, &request);
+ *length = request.TransferredBytes();
+ return status;
}
@@ -581,9 +852,9 @@ nvme_disk_flush(nvme_disk_driver_info* info)
{
status_t status = EINPROGRESS;
- qpair_info* qpinfo = get_next_qpair(info);
+ qpair_info* qpinfo = get_qpair(info);
int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
- (nvme_cmd_cb)disk_io_callback, &status);
+ (nvme_cmd_cb)io_finished_callback, &status);
if (ret != 0)
return ret;
@@ -783,7 +1054,7 @@ struct device_module_info sNvmeDiskDevice = {
nvme_disk_free,
nvme_disk_read,
nvme_disk_write,
- NULL,
+ nvme_disk_io,
nvme_disk_ioctl,
NULL, // select
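
Taken together, the alignment and rounding checks in nvme_disk_io() above
reduce to one question: can the controller consume the request's vectors
as-is, or must the transfer be bounced? A simplified restatement
(needs_bouncing is a hypothetical helper, not in the driver; the conditions
mirror the hunks above):

    static bool
    needs_bouncing(const physical_entry* vecs, int32 count, off_t offset,
        generic_size_t length, size_t block_size)
    {
        // Middle vectors must be page-aligned and whole pages in size.
        for (int32 i = 1; i < count - 1; i++) {
            if ((vecs[i].address % B_PAGE_SIZE) != 0
                    || (vecs[i].size % B_PAGE_SIZE) != 0)
                return true;
        }
        if (count > 1) {
            // The first vector must end on a page boundary, the last must
            // begin on one, and both must be whole blocks.
            if (((vecs[0].address + vecs[0].size) % B_PAGE_SIZE) != 0
                    || (vecs[0].size % block_size) != 0)
                return true;
            if ((vecs[count - 1].address % B_PAGE_SIZE) != 0
                    || (vecs[count - 1].size % block_size) != 0)
                return true;
        }
        // Finally, the request itself must be block-aligned.
        return (offset % block_size) != 0 || (length % block_size) != 0;
    }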
############################################################################
Revision: hrev54102
Commit: 18e9c8885f3c45110425f8d7508f5f7c71ee0a36
URL: https://git.haiku-os.org/haiku/commit/?id=18e9c8885f3c
Author: Augustin Cavalier <waddlesplash@xxxxxxxxx>
Date: Tue Apr 28 03:37:01 2020 UTC
Ticket: https://dev.haiku-os.org/ticket/15123
Ticket: https://dev.haiku-os.org/ticket/15818
nvme_disk: Use DMAResource for bouncing.
This is a "best of both worlds" approach: if nvme_disk
determines the I/O can be done with no bouncing at all,
it will do so; otherwise, the I/O is done by the _bounce
method that uses DMAResource.
Thanks to mmlr for helping investigate some of the DMAResource
crashes and other oddities.
Fixes #15818 and #15123.
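
The shape of the DMAResource-based bounce path in the diff below, reduced to
its skeleton (the DMAResource, IOOperation and io_request calls are as used
there; do_chunk_io stands in for do_nvme_io_request(), and error paths are
trimmed):

    while (request->RemainingBytes() > 0) {
        IOOperation operation;
        // Carve off the next chunk the hardware can take, bouncing into a
        // pre-allocated DMA buffer only where the restrictions demand it.
        status = dma_resource.TranslateNext(request, &operation, 0);
        if (status != B_OK)
            break;

        size_t transferredBytes = 0;
        do {
            // A bounced partial write takes two passes: Finish() returns
            // false after the read-in pass and re-arms the operation.
            status = do_chunk_io(info, &operation);
            if (status == B_OK && operation.IsWrite() == request->IsWrite())
                transferredBytes += operation.OriginalLength();
            operation.SetStatus(status);
        } while (status == B_OK && !operation.Finish());

        operation.SetTransferredBytes(transferredBytes);
        request->OperationFinished(&operation, status, status != B_OK,
            operation.OriginalOffset() + transferredBytes);
        dma_resource.RecycleBuffer(operation.Buffer());
        if (status != B_OK)
            break;
    }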
----------------------------------------------------------------------------
diff --git a/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp b/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
index 324d2d9521..b37404ea3c 100644
--- a/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
+++ b/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp
@@ -94,6 +94,9 @@ typedef struct {
uint32 qpair_count;
uint32 next_qpair;
+ DMAResource dma_resource;
+ sem_id dma_buffers_sem;
+
rw_lock rounded_write_lock;
ConditionVariable interrupt;
@@ -217,6 +220,7 @@ nvme_disk_init_device(void* _info, void** _cookie)
info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
if (info->ns == NULL) {
TRACE_ERROR("failed to open namespace!\n");
+ nvme_ctrlr_close(info->ctrlr);
return B_ERROR;
}
@@ -224,6 +228,7 @@ nvme_disk_init_device(void* _info, void** _cookie)
err = nvme_ns_stat(info->ns, &nsstat);
if (err != 0) {
TRACE_ERROR("failed to get namespace information!\n");
+ nvme_ctrlr_close(info->ctrlr);
return err;
}
@@ -245,9 +250,35 @@ nvme_disk_init_device(void* _info, void** _cookie)
}
if (info->qpair_count == 0) {
TRACE_ERROR("failed to allocate qpairs!\n");
+ nvme_ctrlr_close(info->ctrlr);
return B_NO_MEMORY;
}
+ // allocate DMA buffers
+ int buffers = info->qpair_count * 2;
+
+ dma_restrictions restrictions = {};
+ restrictions.alignment = B_PAGE_SIZE;
+ // Technically, the first and last segments in a transfer can be
+ // unaligned, and the rest only need to have sizes that are a
multiple
+ // of the block size.
+ restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
+ restrictions.max_transfer_size = cstat.max_xfer_size;
+
+	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
+ if (err != 0) {
+ TRACE_ERROR("failed to initialize DMA resource!\n");
+ nvme_ctrlr_close(info->ctrlr);
+ return err;
+ }
+
+ info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
+ if (info->dma_buffers_sem < 0) {
+ TRACE_ERROR("failed to create DMA buffers semaphore!\n");
+ nvme_ctrlr_close(info->ctrlr);
+ return info->dma_buffers_sem;
+ }
+
// set up rounded-write lock
rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");
@@ -453,7 +484,8 @@ int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
*address = request->iovecs[index].address;
*length = request->iovecs[index].size;
-	TRACE("IOV %d: 0x%" B_PRIxPHYSADDR ", %d\n", request->iovec_i, *address, (int)*length);
+	TRACE("IOV %d: 0x%" B_PRIx64 ", %" B_PRIu32 "\n", request->iovec_i, *address,
+ *length);
request->iovec_i++;
return 0;
@@ -500,148 +532,77 @@ do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
}
-static status_t bounce_copy(bool write, int8* buffer, phys_size_t& buffer_offset,
- phys_size_t& buffer_remaining, int32& vec_i, phys_size_t& vec_offset,
- physical_entry* iovecs, int32 vec_count)
+static status_t
+nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
- status_t status = B_OK;
- while (buffer_remaining > 0 && vec_i < vec_count) {
- phys_size_t len = min_c(iovecs[vec_i].size - vec_offset,
- buffer_remaining);
- if (write) {
- status = vm_memcpy_from_physical(buffer + buffer_offset,
- iovecs[vec_i].address + vec_offset, len, false);
- } else {
-			status = vm_memcpy_to_physical(iovecs[vec_i].address + vec_offset,
- buffer + buffer_offset, len, false);
- }
- if (status != B_OK)
- break;
+ CALLED();
- vec_offset += len;
- if (vec_offset == iovecs[vec_i].size) {
- vec_i++;
- vec_offset = 0;
- }
- buffer_remaining -= len;
- buffer_offset += len;
- }
- return status;
-}
+ WriteLocker writeLocker;
+ if (request->IsWrite())
+ writeLocker.SetTo(handle->info->rounded_write_lock, false);
+ status_t status = acquire_sem(handle->info->dma_buffers_sem);
+ if (status != B_OK) {
+ request->SetStatusAndNotify(status);
+ return status;
+ }
-static status_t
-nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request,
- off_t rounded_pos, phys_size_t rounded_len,
- physical_entry* iovecs, int32 count)
-{
const size_t block_size = handle->info->block_size;
- const phys_size_t buffer_size = B_PAGE_SIZE,
- blocks_per_buffer = buffer_size / block_size;
-
- phys_addr_t phys_buffer;
- phys_size_t buffer_offset = (request->Offset() - rounded_pos);
- int8* buffer = (int8*)nvme_mem_alloc_node(buffer_size, buffer_size, 0,
- &phys_buffer);
- if (buffer == NULL)
- return B_NO_MEMORY;
- status_t status = B_OK;
- if (request->IsWrite() && request->Offset() != rounded_pos) {
- // Read in the first block.
- nvme_io_request first;
- memset(&first, 0, sizeof(nvme_io_request));
+ TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
+ "; Write %s\n", request, request->Offset(), request->Length(),
+ request->IsWrite() ? "yes" : "no");
- first.lba_start = rounded_pos / block_size;
- first.lba_count = 1;
+ nvme_io_request nvme_request;
+ while (request->RemainingBytes() > 0) {
+ IOOperation operation;
+		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
+ if (status != B_OK)
+ break;
- physical_entry entry;
- entry.address = phys_buffer;
- entry.size = block_size;
- first.iovecs = &entry;
- first.iovec_count = 1;
+ size_t transferredBytes = 0;
+ do {
+			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
+				", write: %s\n", request, operation.Offset(),
+				operation.Length(), operation.IsWrite() ? "yes" : "no");
- status = do_nvme_io_request(handle->info, &first);
- }
+ nvme_request.write = operation.IsWrite();
+			nvme_request.lba_start = operation.Offset() / block_size;
+			nvme_request.lba_count = operation.Length() / block_size;
+ nvme_request.iovecs = (physical_entry*)operation.Vecs();
+ nvme_request.iovec_count = operation.VecCount();
- if (status != B_OK) {
- nvme_free(buffer);
- return status;
- }
+			status = do_nvme_io_request(handle->info, &nvme_request);
+			if (status == B_OK && nvme_request.write == request->IsWrite())
+ transferredBytes += operation.OriginalLength();
- nvme_io_request nvme_request;
- memset(&nvme_request, 0, sizeof(nvme_io_request));
- physical_entry request_entry;
- request_entry.address = phys_buffer;
- nvme_request.iovecs = &request_entry;
- nvme_request.iovec_count = 1;
-
- off_t lba_offset = rounded_pos / block_size;
- phys_size_t lba_remaining = rounded_len / block_size, iovec_offset = 0;
- int32 iovec = 0;
- while (lba_remaining > 0 && status == B_OK) {
- phys_size_t buffer_remaining = buffer_size - buffer_offset;
- if (request->IsWrite() && lba_remaining <= blocks_per_buffer
-			&& ((rounded_len - request->Length()) - (request->Offset() - rounded_pos)) > 0) {
- // Read in the last block.
- nvme_io_request last;
- memset(&last, 0, sizeof(nvme_io_request));
-
- last.lba_start = lba_offset + lba_remaining - 1;
- last.lba_count = 1;
-
- physical_entry entry;
-			entry.address = phys_buffer + ((lba_remaining - 1) * block_size);
- entry.size = block_size;
- last.iovecs = &entry;
- last.iovec_count = 1;
-
- status = do_nvme_io_request(handle->info, &last);
- if (status != B_OK)
- break;
- }
+ operation.SetStatus(status);
+ } while (status == B_OK && !operation.Finish());
- if (request->IsWrite()) {
-			status = bounce_copy(true, buffer, buffer_offset, buffer_remaining,
- iovec, iovec_offset, iovecs, count);
- if (status != B_OK)
- break;
-			request_entry.size = ROUNDUP(buffer_size - buffer_remaining, block_size);
- } else {
-			request_entry.size = min_c(lba_remaining * block_size, buffer_size);
+ if (status == B_OK && operation.Status() != B_OK) {
+ TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
+ status = operation.Status();
}
- nvme_request.lba_start = lba_offset;
- nvme_request.lba_count = request_entry.size / block_size;
+ operation.SetTransferredBytes(transferredBytes);
+ request->OperationFinished(&operation, status, status != B_OK,
+ operation.OriginalOffset() + transferredBytes);
- status = do_nvme_io_request(handle->info, &nvme_request);
+ handle->info->dma_resource.RecycleBuffer(operation.Buffer());
+
+		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
+ strerror(status), request->RemainingBytes());
if (status != B_OK)
break;
-
- lba_offset += nvme_request.lba_count;
- lba_remaining -= nvme_request.lba_count;
-
- if (request->IsRead()) {
-			status = bounce_copy(false, buffer, buffer_offset, buffer_remaining,
- iovec, iovec_offset, iovecs, count);
- }
-
- buffer_offset = 0;
}
- // We're done with the bounce buffer.
- nvme_free(buffer);
+ release_sem(handle->info->dma_buffers_sem);
- // Determine how much was actually transferred relative to the request.
- const off_t endOffset = request->Offset() + request->Length(),
- trueEndOffset = lba_offset * block_size;
- generic_size_t transferred;
- if (trueEndOffset >= endOffset)
- transferred = request->Length();
+ // Notify() also takes care of UnlockMemory().
+ if (status != B_OK && request->Status() == B_OK)
+ request->SetStatusAndNotify(status);
else
- transferred = trueEndOffset - request->Offset();
-
-	request->SetTransferredBytes(transferred < request->Length(), transferred);
+ request->NotifyFinished();
return status;
}
@@ -754,10 +715,7 @@ nvme_disk_io(void* cookie, io_request* request)
if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
-		status = nvme_disk_bounced_io(handle, request, rounded_pos, rounded_len,
- nvme_request.iovecs, nvme_request.iovec_count);
- request->SetStatusAndNotify(status);
- return status;
+ return nvme_disk_bounced_io(handle, request);
}
nvme_request.lba_start = rounded_pos / block_size;
@@ -982,16 +940,15 @@ nvme_disk_init_driver(device_node* node, void** cookie)
return ret;
}
- nvme_disk_driver_info* info = (nvme_disk_driver_info*)malloc(
- sizeof(nvme_disk_driver_info));
+ nvme_disk_driver_info* info = new nvme_disk_driver_info;
if (info == NULL)
return B_NO_MEMORY;
- memset(info, 0, sizeof(*info));
-
info->media_status = B_OK;
info->node = node;
+ info->ctrlr = NULL;
+
*cookie = info;
return B_OK;
}
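
One note on the final hunk: nvme_disk_driver_info now holds members with
non-trivial constructors (notably the DMAResource added above), so the old
malloc()+memset() allocation would leave them unconstructed and then clobber
their state. Allocating with new runs the constructors, and only the plain
fields still need explicit initialization. A minimal illustration of the
hazard, using a reduced, hypothetical struct:

    struct driver_info {
        DMAResource dma_resource;    // has a real constructor/destructor
        status_t media_status;
    };

    driver_info* bad = (driver_info*)malloc(sizeof(driver_info));
    memset(bad, 0, sizeof(*bad));    // stomps DMAResource's internals

    driver_info* good = new driver_info;    // constructors run properly
    good->media_status = B_OK;               // plain fields set by hand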