Jetpack/kernel/nvidia/drivers/virt/tegra/vm_err_sample_handler.c
dchvs 75c7968d30 Add Jetpack 4.4.1 sources
Jetson Xavier NX, Jetson TX2 Series, Jetson AGX Xavier Series, Jetson Nano, Jetson TX1 [L4T 32.4.4]
2021-01-19 20:45:17 -06:00

574 lines
17 KiB
C

/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#define pr_fmt(fmt) "vm-err-sample-handler: " fmt
#include <linux/module.h>
#include <linux/vm_err.h>
/* Exception class (EC) values compared against the class extracted from
 * ESR_EL2 in print_sync().
 */
#define ESR_EC_INSTR_ABORT_LOWER_EL 0x20U
#define ESR_EC_DATA_ABORT_LOWER_EL 0x24U
/* Data abort syndrome fields of ESR_EL2, decoded in print_data_abort():
 *   ISV  - instruction syndrome valid; gates the direction/size decode
 *   SAS  - access size, reported as (1 << SAS)
 *   FNV  - tested (must be clear) before a fault address is reported
 *   WNR  - set for a write access, clear for a read access
 *   DFSC - fault status code held in the low six bits
 */
#define ESR_DABT_ISS_ISV_MASK (1U << 24)
#define ESR_DABT_ISS_SAS_SHIFT 22U
#define ESR_DABT_ISS_SAS_MASK (3U << ESR_DABT_ISS_SAS_SHIFT)
#define ESR_DABT_ISS_FNV_MASK (1U << 10)
#define ESR_DABT_ISS_WNR_MASK (1U << 6)
#define ESR_DABT_ISS_DFSC_MASK 0x3F
/* Bridge error details.
 * Note: these values are duplicated here only so that the handler can print
 * user-friendly messages describing the error.
 *
 * BRIDGE_ERROR_TIMEOUT must match the "Timeout error" value in
 * t18x_axi_errors[] in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_ERROR_TIMEOUT = 18;
/* BRIDGE_SRC_ID_CCPLEX must match the "CCPLEX" value in src_ids
 * in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_SRC_ID_CCPLEX = 1;
/* GUEST_UNASSIGNED must match the corresponding HV definition in pct.h.
 * handle_peer_err_details() treats this id as "error not attributable to any
 * guest" rather than as an invalid guest id.
 */
static const unsigned int GUEST_UNASSIGNED = 18;
/* Hook table filled in and registered by hooks_init(). */
static struct tegra_hv_vm_err_handlers handlers;
/* Hypervisor configuration fetched by hooks_init(); num_guests is used to
 * validate the offending guest id of peer errors.
 */
static struct tegra_hv_config config;
/* Log the asynchronous bridge error record held in @err_data.
 *
 * Every line is emitted at critical level. The error type and source id are
 * annotated with a readable tag when they equal BRIDGE_ERROR_TIMEOUT and
 * BRIDGE_SRC_ID_CCPLEX respectively, and the protection bits are decoded
 * into instruction/data, security and privilege strings.
 */
static void print_bridge_error(const struct err_data_t * const err_data)
{
	const struct async_bridge_err_t * const bridge =
		&err_data->async_bridge_err;
	const char *type_tag = "";
	const char *source_tag = "";
	unsigned int prot;

	if (bridge->err_type == BRIDGE_ERROR_TIMEOUT)
		type_tag = "(Timeout)";
	if (bridge->src_id == BRIDGE_SRC_ID_CCPLEX)
		source_tag = " (CCPLEX)";

	pr_crit("Bridge error details\n");
	pr_crit("--------------------------------------\n");
	pr_crit("Err count %d: %s FAULT ADDR 0x%x status1 0x%x status2 0x%x\n",
		bridge->count, bridge->br_name, bridge->err_addr,
		bridge->err_status1, bridge->err_status2);
	pr_crit("\tDirection: %s\n", bridge->rw ? "READ" : "WRITE");
	pr_crit("\tBridge ID: 0x%x\n", bridge->br_id);
	pr_crit("\tError type: %u %s\n", bridge->err_type, type_tag);
	pr_crit("\tLength: %d\n", bridge->length);

	/* Bit 2: instruction/data, bit 1: non-secure, bit 0: privileged. */
	prot = bridge->protection;
	pr_crit("\tProtection: 0x%x %s %s %s access\n", prot,
		(prot & 0x4) ? "Instruction" : "Data",
		(prot & 0x2) ? "Non-Secure" : "Secure",
		(prot & 0x1) ? "Privileged" : "Unprivileged");

	pr_crit("\tSource ID: 0x%x -- %s\n", bridge->src_id, source_tag);
	pr_crit("\tAXI_ID: 0x%x\n", bridge->axi_id);
	pr_crit("\tCache: 0x%x\n", bridge->cache);
	pr_crit("\tBurst: 0x%x\n", bridge->burst);
	pr_crit("--------------------------------------\n");
}
/* Log the CBB cache attribute value @cache with a short description.
 *
 * The two low bits select "Non-cacheable/Non-Bufferable" (0) or "Device" (1).
 * Otherwise the whole value is compared: exactly 0x2 and exactly 0x3 get
 * their own descriptions and anything else is reported as plain "Cacheable".
 */
static void print_cache(uint32_t cache)
{
	const uint32_t low_bits = cache & 0x3;

	if (low_bits == 0x0) {
		pr_crit("\t Cache\t\t\t: 0x%x--Non-cacheable/Non-Bufferable)\n",
			cache);
	} else if (low_bits == 0x1) {
		pr_crit("\t Cache\t\t\t: 0x%x -- Device\n",
			cache);
	} else if (cache == 0x2) {
		pr_crit("\t Cache\t\t\t: 0x%x -- Cacheable/Non-Bufferable\n",
			cache);
	} else if (cache == 0x3) {
		pr_crit("\t Cache\t\t\t: 0x%x -- Cacheable/Bufferable\n",
			cache);
	} else {
		/* Low bits are 2 or 3 but higher bits are set as well. */
		pr_crit("\t Cache\t\t\t: 0x%x -- Cacheable\n",
			cache);
	}
}
/* Log the CBB protection value @prot together with its decoded meaning.
 *
 * Bit 0 selects Privileged/Unprivileged, bit 1 Non-Secure/Secure and
 * bit 2 Instruction/Data; the strings are printed in that order.
 */
static void print_prot(uint32_t prot)
{
	const char *privilege = "Unprivileged";
	const char *security = "Secure";
	const char *access_kind = "Data";

	if (prot & 0x1)
		privilege = "Privileged";
	if (prot & 0x2)
		security = "Non-Secure";
	if (prot & 0x4)
		access_kind = "Instruction";

	pr_crit("\t Protection\t\t: 0x%x -- %s, %s, %s Access\n",
		prot, privilege, security, access_kind);
}
static void print_cbb_error(const struct err_data_t * const err_data)
{
const struct async_cbb_err_t * const cbb_err_data =
&err_data->async_cbb_err;
pr_crit("Control Back Bone(CBB) error details\n");
pr_crit("--------------------------------------\n");
pr_crit("Error:%s\n", cbb_err_data->cbb_name);
pr_crit("\tError Logger\t\t: %d\n", cbb_err_data->error_logger);
pr_crit("\tErrLog0\t\t\t: 0x%x\n", cbb_err_data->errlog0);
pr_crit("\t Transaction Type\t: %s\n", cbb_err_data->transaction_type);
pr_crit("\t Error Code\t\t: %s\n", cbb_err_data->error_code);
pr_crit("\t Error Source\t\t: %s\n",
cbb_err_data->error_source);
pr_crit("\t Error Description\t: %s\n",
cbb_err_data->error_description);
pr_crit("\t Packet header Lock\t: %d\n",
cbb_err_data->packet_header_lock);
pr_crit("\t Packet header Len1\t: %d\n",
cbb_err_data->packet_header_len1);
if (cbb_err_data->header_format)
pr_crit("\t NOC protocol version\t: %s\n", "version >= 2.7");
else
pr_crit("\t NOC protocol version\t: %s\n", "version < 2.7");
pr_crit("\tErrLog1\t\t\t: 0x%x\n", cbb_err_data->errlog1);
pr_crit("\tErrLog2\t\t\t: 0x%x\n", cbb_err_data->errlog2);
pr_crit("\t RouteId\t\t: 0x%llx\n", cbb_err_data->route_id);
pr_crit("\t InitFlow\t\t: %s\n", cbb_err_data->initflow);
pr_crit("\t Targflow\t\t: %s\n", cbb_err_data->targflow);
pr_crit("\t TargSubRange\t\t: %d\n", cbb_err_data->targ_subrange);
pr_crit("\t SeqId\t\t\t: %d\n", cbb_err_data->seqid);
pr_crit("\tErrLog3\t\t\t: 0x%x\n", cbb_err_data->errlog3);
pr_crit("\tErrLog4\t\t\t: 0x%x\n", cbb_err_data->errlog4);
pr_crit("\t Address\t\t: 0x%llx\n", cbb_err_data->address);
pr_crit("\tErrLog5\t\t\t: 0x%x\n", cbb_err_data->errlog5);
pr_crit("\t Master ID\t\t: %s\n", cbb_err_data->master_id);
pr_crit("\t Non-Modify\t\t: 0x%x\n", cbb_err_data->non_mod);
pr_crit("\t AXI ID\t\t: 0x%x\n", cbb_err_data->axi_id);
pr_crit("\t Security Group(GRPSEC): 0x%x\n",
cbb_err_data->security_group);
print_cache(cbb_err_data->cache);
print_prot(cbb_err_data->protection);
pr_crit("\t FALCONSEC\t\t: 0x%x\n", cbb_err_data->falconsec);
pr_crit("\t Virtual Queuing Channel(VQC): 0x%x\n",
cbb_err_data->virtual_q_channel);
pr_crit("--------------------------------------\n");
}
/* Log the asynchronous SMMU error record held in @err_data.
 *
 * @err_data: error record; only its async_smmu_err member is read.
 * @reason:   REASON_ASYNC_SMMU_CB or REASON_ASYNC_SMMU_GLOBAL; selects the
 *            headline. Any other value is reported as unexpected, and the
 *            register dump is still printed.
 *
 * The syndrome values come from the fsynr0/fsynr1 fields and are labelled
 * FSYNR0/FSYNR1 to match, alongside FSR and FAR.
 */
static void print_smmu_error(const struct err_data_t * const err_data,
			     const enum err_reason reason)
{
	const struct async_smmu_err_t * const smmu_err_data =
		&err_data->async_smmu_err;

	pr_crit("SMMU error details\n");
	pr_crit("--------------------------------------\n");
	if (reason == REASON_ASYNC_SMMU_CB) {
		pr_crit("SMMU Context Bank %u error. StreamID: %d\n",
			smmu_err_data->cb_id, smmu_err_data->stream_id);
	} else if (reason == REASON_ASYNC_SMMU_GLOBAL) {
		pr_crit("Global SMMU fault. CB: %u. StreamID: %d\n",
			smmu_err_data->cb_id, smmu_err_data->stream_id);
	} else {
		pr_crit("Unexpected fault reason %d\n", reason);
	}
	pr_crit("FSR: 0x%x; FAR: 0x%llx; FSYNR0: 0x%x; FSYNR1: 0x%x\n",
		smmu_err_data->fsr, smmu_err_data->far,
		smmu_err_data->fsynr0, smmu_err_data->fsynr1);
	pr_crit("--------------------------------------\n");
}
/* Log the T19x memory controller error record held in @err_data.
 *
 * When the record flags a VPR violation the VPR registers are dumped first.
 * The fault itself is then printed in one of three layouts, chosen by the
 * no_status and two_status flags of the record.
 */
static void print_t19x_mc_error(const struct err_data_t * const err_data)
{
	const struct async_mc_err_t19x_t * const mc =
		&err_data->async_mc_err_t19x;

	pr_crit("Memory Controller error details\n");
	pr_crit("--------------------------------------\n");

	if (mc->vpr_violation) {
		pr_crit("vpr base=%x:%x, size=%x, ctrl=%x, override:(%x, %x, %x, %x)\n",
			mc->vpr_base[0], mc->vpr_base[1],
			mc->vpr_size, mc->vpr_ctrl,
			mc->vpr_override[0], mc->vpr_override[1],
			mc->vpr_override[2], mc->vpr_override[3]);
	}

	if (mc->no_status) {
		pr_crit("MC fault - no status: %s\n", mc->fault_msg);
		return;
	}

	if (mc->two_status) {
		/* In this layout the address field is printed as status2. */
		pr_crit("MC fault - %s\n", mc->fault_msg);
		pr_crit("status: 0x%08x status2: 0x%08llx\n",
			mc->status, (unsigned long long)mc->address);
		return;
	}

	pr_crit("(%d) %s: %s\n", mc->client_swgid, mc->client_name,
		mc->fault_msg);
	pr_crit(" status = 0x%08x; addr = 0x%08llx\n",
		mc->status, (unsigned long long)mc->address);
	pr_crit(" secure: %s, access-type: %s\n",
		mc->secure ? "yes" : "no",
		mc->write ? "write" : "read");
}
/* Log the memory controller error record held in @err_data: channel base,
 * interrupt and error status, fault address, and the vcpu/client/peripheral
 * ids stored in the record.
 */
static void print_mc_error(const struct err_data_t * const err_data)
{
	const struct async_mc_err_t * const mc = &err_data->async_mc_err;

	pr_crit("Memory Controller error details\n");
	pr_crit("--------------------------------------\n");
	pr_crit("mc_err: base: 0x%llx, int_status: 0x%08x; err_status: 0x%08x; fault_addr: 0x%llx\n",
		mc->ch_base, mc->int_status, mc->err_status, mc->fault_addr);
	pr_crit("vcpuid %u, client_id %u, peripheral_id %d\n",
		mc->vcpuid, mc->client_id, mc->peripheral_id);
	pr_crit("--------------------------------------\n");
}
/* Log the complete synchronous exception context held in @err_data.
 *
 * @err_data:   error record; only its sync member is read.
 * @with_frame: when true, a title line and opening/closing separator lines
 *              are printed around the dump. Callers that print their own
 *              header and footer pass false.
 *
 * Prints the offending VCPU id, the EL2/EL1 system registers stored in the
 * record, the faulting instruction word and general purpose registers
 * x0..x30 (x30 is labelled as the link register).
 */
static void print_sync_full_details(const struct err_data_t *const err_data,
				    bool with_frame)
{
	/* Unsigned so that it matches the %u conversion used below. */
	unsigned int reg;
	const struct sync_t * const sync = &err_data->sync;

	if (with_frame) {
		pr_crit("Synchronous exception, full details\n");
		pr_crit("--------------------------------------\n");
	}
	pr_crit("Offending VCpu Id %u\n", sync->offending_vcpu_id);
	pr_crit("(Following register validity depends on error context)\n");
	pr_crit("ESR_EL2: 0x%08x\n", sync->esr_el2);
	pr_crit("ELR_EL2: 0x%016llx\n", sync->elr_el2);
	pr_crit("FAR_EL2: 0x%016llx\n", sync->far_el2);
	pr_crit("HPFAR_EL2: 0x%016llx\n", sync->hpfar_el2);
	pr_crit("PAR_HPFAR_EL2: 0x%016llx\n", sync->par_hpfar_el2);
	pr_crit("SPSR_EL2: 0x%016llx\n", sync->spsr_el2);
	pr_crit("ELR_EL1: 0x%016llx\n", sync->elr_el1);
	pr_crit("FAR_EL1: 0x%016llx\n", sync->far_el1);
	pr_crit("SPSR_EL1: 0x%016llx\n", sync->spsr_el1);
	pr_crit("ESR_EL1: 0x%08x\n", sync->esr_el1);
	pr_crit("Fault Instr: 0x%08x\n", sync->fault_instr);
	pr_crit("General Purpose Registers\n");
	/* x0..x29 share one format; x30 gets its own annotated line. */
	for (reg = 0; reg < 30; reg++)
		pr_crit(" x%02u: 0x%016llx\n", reg, sync->gpr_array[reg]);
	pr_crit(" x30: 0x%016llx (link register)\n", sync->gpr_array[30]);
	if (with_frame)
		pr_crit("--------------------------------------\n");
}
/* Return the two-bit SAS field of the data abort syndrome @esr.
 *
 * The result is in the range 0..3; print_data_abort() reports the access
 * size as (1 << SAS).
 */
static inline uint8_t extract_dabt_iss_sas(const uint32_t esr)
{
	const uint32_t sas = (esr >> ESR_DABT_ISS_SAS_SHIFT) & 3U;

	return (uint8_t)sas;
}
/* Log a data abort taken from the sync context in @err_data.
 *
 * The ESR_EL2 syndrome is decoded first: access direction and size are
 * printed only when the ISV bit is set, and a combined fault address is
 * printed only when FNV is clear and the fault status code is below 13.
 * The full register context follows without its own frame.
 */
static void print_data_abort(const struct err_data_t *const err_data)
{
	const struct sync_t * const ctx = &err_data->sync;
	const uint32_t esr = ctx->esr_el2;
	const uint32_t fault_status = esr & ESR_DABT_ISS_DFSC_MASK;

	pr_crit("Data abort details\n");
	pr_crit("--------------------------------------\n");

	/* Direction and size are only meaningful with a valid syndrome. */
	if (esr & ESR_DABT_ISS_ISV_MASK) {
		uint8_t access_size = (1U << extract_dabt_iss_sas(esr));

		if ((esr & ESR_DABT_ISS_WNR_MASK) > 0)
			pr_crit("write access\n");
		else
			pr_crit("read access\n");
		pr_crit("access size %u\n", access_size);
	}

	/* Report an address only when both fault address registers are
	 * considered valid.
	 */
	if (!(esr & ESR_DABT_ISS_FNV_MASK) && (fault_status < 13U)) {
		/* HPFAR_EL2 supplies the upper part, FAR_EL2 the low 12 bits. */
		uint64_t fault_addr = (ctx->hpfar_el2 << 8) |
				      (ctx->far_el2 & 0xfffULL);

		pr_crit("Fault address: 0x%llx\n", fault_addr);
	}

	pr_crit("Additional context:\n");
	/* Remainder of the trap context. */
	print_sync_full_details(err_data, false);
	pr_crit("--------------------------------------\n");
}
/* Log an instruction abort: a header, the full synchronous trap context from
 * @err_data (printed without its own frame) and a closing separator.
 */
static void print_instr_abort(const struct err_data_t *const err_data)
{
pr_crit("Instruction abort details\n");
pr_crit("--------------------------------------\n");
/* Print trap context; the frame is supplied by this function. */
print_sync_full_details(err_data, false);
pr_crit("--------------------------------------\n");
}
/* Return the exception class of @esr, i.e. its top six bits (31:26),
 * shifted down into bits 5:0 of the result.
 */
static inline uint32_t exception_class(const uint32_t esr)
{
	return (esr >> 26) & 0x3FU;
}
/* Log a synchronous exception described by @err_data.
 *
 * If the sync record is not filled in, only a short notice is printed.
 * Otherwise the exception class of ESR_EL2 selects the printer: data aborts
 * and instruction aborts from a lower EL get dedicated output, everything
 * else gets the framed full register dump.
 *
 * This is demo-level preprocessing; an implementation tailored to a specific
 * use case should consult the AArch64 reference manual for full details.
 */
static void print_sync(const struct err_data_t *const err_data)
{
	uint32_t ec;

	if (!err_data->sync.is_filled) {
		pr_crit("Synchronous exception, no details available\n");
		return;
	}

	ec = exception_class(err_data->sync.esr_el2);
	if (ec == ESR_EC_DATA_ABORT_LOWER_EL)
		print_data_abort(err_data);
	else if (ec == ESR_EC_INSTR_ABORT_LOWER_EL)
		print_instr_abort(err_data);
	else
		print_sync_full_details(err_data, true);
}
static bool handle_async_err_details(const struct err_data_t * const err_data)
{
bool enter_bad_mode;
if (err_data->err_type != ASYNC) {
pr_crit("%s: incorrect error type: %d\n", __func__,
err_data->err_type);
/* Unexpected error type. Enter bad mode. */
return true;
}
pr_info("%s: error reason: %s\n", __func__,
tegra_hv_err_reason_desc[err_data->err_reason]);
switch (err_data->err_reason) {
case REASON_ASYNC_BRIDGE:
print_bridge_error(err_data);
/* Bridge error may not be fatal */
enter_bad_mode = false;
break;
case REASON_ASYNC_CBB:
print_cbb_error(err_data);
/* CBB error may not be fatal */
enter_bad_mode = false;
break;
case REASON_ASYNC_SMMU_CB:
print_smmu_error(err_data, err_data->err_reason);
/* SMMU context bank error may not be fatal */
enter_bad_mode = false;
break;
case REASON_ASYNC_SMMU_GLOBAL:
print_smmu_error(err_data, err_data->err_reason);
/* Can't recover from global SMMU error. */
enter_bad_mode = true;
break;
case REASON_ASYNC_MC:
print_mc_error(err_data);
enter_bad_mode = false;
break;
case REASON_ASYNC_MC_T19X:
print_t19x_mc_error(err_data);
enter_bad_mode = false;
break;
default:
pr_crit("%s: unhandled error. Reason id %d\n", __func__,
err_data->err_reason);
enter_bad_mode = true;
break;
}
return enter_bad_mode;
}
static bool handle_sync_err_details(const struct err_data_t * const err_data)
{
pr_info("%s: error reason: %s\n", __func__,
tegra_hv_err_reason_desc[err_data->err_reason]);
print_sync(err_data);
/* Recovery from sync error could be impossible. Enter bad mode. */
return true;
}
static bool handle_peer_err_details(const struct err_data_t * const err_data)
{
bool enter_bad_mode;
const unsigned int offender = err_data->offending_guest_id;
if (offender >= config.num_guests) {
if (offender != GUEST_UNASSIGNED) {
pr_crit("%s: invalid offending peer guest id %u\n",
__func__, offender);
/* Unexpected. Cause reboot. */
return true;
}
pr_crit("%s: HV can't attribute error to any guest\n",
__func__);
} else
pr_crit("Peer error. Offending guest id = %u\n", offender);
pr_crit("Error Type: %s\n", (err_data->err_type == SYNC) ?
"Synchronous" : "Asynchronous");
if (err_data->err_reason >= REASON_ENUM_SIZE) {
pr_crit("%s: unexpected reason id %u\n", __func__,
err_data->err_reason);
/* Unexpected. Cause reboot. */
return true;
}
pr_crit("%s: error reason: %s\n", __func__,
tegra_hv_err_reason_desc[err_data->err_reason]);
switch (err_data->err_reason) {
case REASON_ASYNC_BRIDGE:
print_bridge_error(err_data);
enter_bad_mode = false;
break;
case REASON_ASYNC_CBB:
print_cbb_error(err_data);
enter_bad_mode = false;
break;
case REASON_ASYNC_SMMU_CB:
case REASON_ASYNC_SMMU_GLOBAL:
print_smmu_error(err_data, err_data->err_reason);
enter_bad_mode = false;
break;
case REASON_ASYNC_MC:
print_mc_error(err_data);
enter_bad_mode = false;
break;
case REASON_ASYNC_MC_T19X:
print_t19x_mc_error(err_data);
enter_bad_mode = false;
break;
case REASON_SYNC:
print_sync(err_data);
enter_bad_mode = false;
break;
default:
pr_crit("%s: unhandled error. Reason id %d\n", __func__,
err_data->err_reason);
enter_bad_mode = false;
break;
}
return enter_bad_mode;
}
/* Hook installed as fn_self_async by hooks_init(); forwards the record to
 * handle_async_err_details() and returns its verdict unchanged.
 */
static bool self_async_err_handler(const struct err_data_t *const err_data)
{
	const bool enter_bad_mode = handle_async_err_details(err_data);

	return enter_bad_mode;
}
/* Hook installed as fn_self_sync by hooks_init(); forwards the record to
 * handle_sync_err_details() and returns its verdict unchanged.
 */
static bool self_sync_err_handler(const struct err_data_t *const err_data)
{
	const bool enter_bad_mode = handle_sync_err_details(err_data);

	return enter_bad_mode;
}
/* Hook installed as fn_peer by hooks_init() (unless CONFIG_TEGRA_EBP is
 * enabled); forwards the record to handle_peer_err_details() and returns its
 * verdict unchanged.
 */
static bool peer_err_handler(const struct err_data_t *const err_data)
{
	const bool enter_bad_mode = handle_peer_err_details(err_data);

	return enter_bad_mode;
}
/* Fill the file-scope hook table and register it with the hypervisor error
 * layer, then fetch and log the hypervisor configuration.
 *
 * The peer hook is left NULL when CONFIG_TEGRA_EBP is enabled.
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * tegra_hv_register_vm_err_hooks().
 */
static int hooks_init(void)
{
	int err;

	handlers.fn_self_async = self_async_err_handler;
	handlers.fn_self_sync = self_sync_err_handler;
	if (IS_ENABLED(CONFIG_TEGRA_EBP))
		handlers.fn_peer = NULL;
	else
		handlers.fn_peer = peer_err_handler;

	err = tegra_hv_register_vm_err_hooks(&handlers);
	if (err != 0)
		return err;

	tegra_hv_get_config(&config);
	pr_info("%s: Guest Id %u\n", __func__, config.guest_id_self);

	/* EBP, being unprivileged, doesn't know about total guests */
	if (!IS_ENABLED(CONFIG_TEGRA_EBP))
		pr_info("%s: Total guests %u\n", __func__, config.num_guests);

	return 0;
}
static void hooks_exit(void)
{
struct tegra_hv_vm_err_handlers handlers;
handlers.fn_self_async = NULL;
handlers.fn_self_sync = NULL;
handlers.fn_peer = NULL;
tegra_hv_register_vm_err_hooks(&handlers);
}
/* hooks_init() is registered through subsys_initcall() and hooks_exit()
 * through module_exit().
 */
subsys_initcall(hooks_init);
module_exit(hooks_exit);
MODULE_AUTHOR("Nvidia Corporation");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Sample VM Error Handler");