On Tue, 6 May 2008 15:38:13 -0700 "Darrick J. Wong" <djwong at us.ibm.com> wrote: > [resend due to truncation] > Refactor the registration function to shrink the macros. Let me know if > more aggressive de-macroing is desirable. > --- > New driver for power meters in IBM System X hardware, with a few > cleanups suggested by Anthony Liguori. > > Signed-off-by: Darrick J. Wong <djwong at us.ibm.com> > --- > > index 0000000..2fefaf5 > --- /dev/null > +++ b/Documentation/hwmon/ibmaem > @@ -0,0 +1,37 @@ > +Kernel driver ibmaem > +====================== > + > +Supported systems: > + * Any recent IBM System X server with Active Energy Manager support. > + This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2, > + x3950 M2, and certain HS2x/LS2x/QS2x blades. The IPMI host interface > + driver ("ipmi-si") needs to be loaded for this driver to do anything. > + Prefix: 'ibmaem' > + Datasheet: Not available > + > +Author: Darrick J. Wong > + > +Description > +----------- > + > +This driver implements sensor reading support for the energy and power > +meters available on various IBM System X hardware through the BMC. All > +sensor banks will be exported as platform devices; this driver can talk > +to both v1 and v2 interfaces. This driver is completely separate from the > +older ibmpex driver. > + > +The v1 AEM interface has a simple set of features to monitor energy use. > +There is a register that displays an estimate of raw energy consumption > +since the last BMC reset, and a power sensor that returns average power > +use over a configurable interval. > + > +The v2 AEM interface is a bit more sophisticated, being able to present > +a wider range of energy and power use registers, the power cap as > +set by the AEM software, and temperature sensors. > + > +Special Features > +---------------- > + > +The "power_cap" value displays the current system power cap, as set by > +the Active Energy Manager software. Setting the power cap from the host > +is not currently supported. > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig > index 4dc76bc..00ff533 100644 > --- a/drivers/hwmon/Kconfig > +++ b/drivers/hwmon/Kconfig > @@ -330,6 +330,20 @@ config SENSORS_CORETEMP > sensor inside your CPU. Supported all are all known variants > of Intel Core family. > > +config SENSORS_IBMAEM > + tristate "IBM Active Energy Manager temperature/power sensors and control" > + select IPMI_SI > + depends on IPMI_HANDLER hm. IMPI_SI depends on IPMI_HANDLER, so this looks OK. But this stuff explodes in our facs so often... > + help > + If you say yes here you get support for the temperature and > + power sensors and capping hardware in various IBM System X > + servers that support Active Energy Manager. This includes > + the x3350, x3550, x3650, x3655, x3755, x3850 M2, x3950 M2, > + and certain HS2x/LS2x/QS2x blades. > + > + This driver can also be built as a module. If so, the module > + will be called ibmaem. > + > > ... > > +static DEFINE_IDR(aem1_idr); > +static DEFINE_SPINLOCK(aem1_idr_lock); > +static DEFINE_IDR(aem2_idr); > +static DEFINE_SPINLOCK(aem2_idr_lock); > + > +struct aem_ipmi_data; > +struct aem1_data; > +struct aem2_data; > + > +static void aem_register_bmc(int iface, struct device *dev); > +static void aem_bmc_gone(int iface); > +static void aem_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data); > +static void aem_init_aem1(struct aem_ipmi_data *probe); > +static void aem1_delete(struct aem1_data *data); > +static void aem_init_aem2(struct aem_ipmi_data *probe); > +static void aem2_delete(struct aem2_data *data); > +static void aem1_remove_sensors(struct aem1_data *data); > +static int aem1_find_sensors(struct aem1_data *data); > +static void aem2_remove_sensors(struct aem2_data *data); > +static int aem2_find_sensors(struct aem2_data *data); If these were moved below the struct definitions, we wouldn't need the forward declarations of the structs. > +static struct device_driver aem_driver = { > + .name = DRVNAME, > + .bus = &platform_bus_type, > +}; > + > +struct aem_ipmi_data { > + struct completion read_complete; > + struct ipmi_addr address; > + ipmi_user_t user; > + int interface; > + > + struct kernel_ipmi_msg tx_message; > + long tx_msgid; > + > + void *rx_msg_data; > + unsigned short rx_msg_len; > + unsigned char rx_result; > + int rx_recv_type; > + > + struct device *bmc_device; > +}; > + > +struct aem_fw_data { > + struct device *hwmon_dev; > + struct platform_device *pdev; > + struct mutex lock; > + char valid; > + unsigned long last_updated; /* In jiffies */ > + u8 ver_major; > + u8 ver_minor; > + u8 module_handle; > + int id; > + struct aem_ipmi_data ipmi; > +}; > + > +struct aem_ro_sensor_template { > + char *label; > + ssize_t (*show)(struct device *dev, > + struct device_attribute *devattr, > + char *buf); > + int index; > +}; > + > +struct aem_rw_sensor_template { > + char *label; > + ssize_t (*show)(struct device *dev, > + struct device_attribute *devattr, > + char *buf); > + ssize_t (*set)(struct device *dev, > + struct device_attribute *devattr, > + const char *buf, size_t count); > + int index; > +}; > + > +struct aem1_data { > + struct aem_fw_data fw; > + struct list_head list; > + > + /* > + * Available sensors: > + * Energy meter > + * Power meter > + */ > + struct sensor_device_attribute sensors[AEM1_NUM_SENSORS]; > + > + /* energy use */ > + u64 energy[AEM1_NUM_ENERGY_REGS]; > + > + int power_period[AEM1_NUM_ENERGY_REGS]; > +}; It'd be nice to document the units in which these physical quantities are recorded. BTUs/fortnight? > +struct aem2_data { > + struct aem_fw_data fw; > + struct list_head list; > + > + /* > + * Available sensors: > + * Two energy meters > + * Two power meters > + * Two temperature sensors > + * Six powercap registers > + */ > + struct sensor_device_attribute sensors[AEM2_NUM_SENSORS]; > + > + /* energy use */ > + u64 energy[AEM2_NUM_ENERGY_REGS]; > + > + /* power caps */ > + u16 pcap[AEM2_NUM_PCAP_REGS]; > + > + /* exhaust temperature */ > + u8 temp[AEM2_NUM_TEMP_REGS]; > + > + /* power sampling interval */ > + int power_period[AEM2_NUM_ENERGY_REGS]; > +}; ditto. > +/* Data structures returned by the AEM firmware */ > +struct aem_iana_id { > + u8 bytes[3]; > +}; > +static struct aem_iana_id system_x_id = { > + .bytes = {0x4D, 0x4F, 0x00} > +}; > + > +/* These are used to find AEM1 instances */ > +struct aem_find_firmware_req { > + struct aem_iana_id id; > + u8 rsvd; > + u16 index; > + u16 module_type_id; > +} __attribute__ ((packed)); We have a __packed helper in compiler-gcc.h > +struct aem_find_firmware_resp { > + struct aem_iana_id id; > + u8 num_instances; > +} __attribute__ ((packed)); > + > +/* These are used to find AEM2 instances */ > +struct aem_find_instance_req { > + struct aem_iana_id id; > + u8 instance_number; > + u16 module_type_id; > +} __attribute__ ((packed)); > + > > ... > > +/* Probe a BMC for AEM firmware instances */ > +static void aem_register_bmc(int iface, struct device *dev) > +{ > + struct aem_ipmi_data probe; 184 bytes of stack. OK, but worth keeping an eye on. > + if (aem_init_ipmi_data(&probe, iface, dev)) > + return; > + > + aem_init_aem1(&probe); > + aem_init_aem2(&probe); > + > + ipmi_destroy_user(probe.user); > +} > + > > ... > > +/* Dispatch IPMI messages to callers */ > +static void aem_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data) > +{ > + unsigned short rx_len; > + struct aem_ipmi_data *data = (struct aem_ipmi_data *)user_msg_data; unneeded and undesirable cast of void*. > + if (msg->msgid != data->tx_msgid) { > + dev_err(data->bmc_device, "Mismatch between received msgid " > + "(%02x) and transmitted msgid (%02x)!\n", > + (int)msg->msgid, > + (int)data->tx_msgid); > + ipmi_free_recv_msg(msg); > + return; > + } > + > + data->rx_recv_type = msg->recv_type; > + if (msg->msg.data_len > 0) > + data->rx_result = msg->msg.data[0]; > + else > + data->rx_result = IPMI_UNKNOWN_ERR_COMPLETION_CODE; > + > + if (msg->msg.data_len > 1) { > + rx_len = msg->msg.data_len - 1; > + if (data->rx_msg_len < rx_len) > + rx_len = data->rx_msg_len; > + data->rx_msg_len = rx_len; > + memcpy(data->rx_msg_data, msg->msg.data + 1, data->rx_msg_len); > + } else > + data->rx_msg_len = 0; > + > + ipmi_free_recv_msg(msg); > + complete(&data->read_complete); > +} > + > > ... > > +/* Obtain an id */ > +#define AEM_IDR_GET(type) \ > +static int type##_idr_get(int *id) \ > +{ \ > + int i, err; \ > +\ > +again: \ > + if (unlikely(idr_pre_get(&type##_idr, GFP_KERNEL) == 0)) \ > + return -ENOMEM; \ > +\ > + spin_lock(&type##_idr_lock); \ > + err = idr_get_new(&type##_idr, NULL, &i); \ > + spin_unlock(&type##_idr_lock); \ > +\ > + if (unlikely(err == -EAGAIN)) \ > + goto again; \ > + else if (unlikely(err)) \ > + return err; \ > +\ > + *id = i & MAX_ID_MASK; \ > + return 0; \ > +} > + > +/* Release an object ID */ > +#define AEM_IDR_PUT(type) \ > +static void type##_idr_put(int id) \ > +{ \ > + spin_lock(&type##_idr_lock); \ > + idr_remove(&type##_idr, id); \ > + spin_unlock(&type##_idr_lock); \ > +} ick. > > ... > > +/* Find and initialize all AEM1 instances */ > +static void aem_init_aem1(struct aem_ipmi_data *probe) > +{ > + int num, i, err; > + > + num = aem_find_aem1_count(probe); > + for (i = 0; i < num; i++) { > + err = aem_init_aem1_inst(probe, i); > + if (err) { > + dev_err(probe->bmc_device, > + "Error %d initializing AEM1 0x%X\n", > + err, i); > + return; Should the error really be dropped on the floor? > + } > + } > +} > + > > ... > > +/* Find and initialize one AEM2 instance */ > +static int aem_init_aem2_inst(struct aem_ipmi_data *probe, > + struct aem_find_instance_resp *fi_resp) > +{ > + struct aem2_data *data; > + int i; > + int res = -ENOMEM; > + > + data = kzalloc(sizeof(*data), GFP_KERNEL); > + if (!data) > + return res; > + mutex_init(&data->fw.lock); > + > + /* Copy instance data */ > + data->fw.ver_major = fi_resp->major; > + data->fw.ver_minor = fi_resp->minor; > + data->fw.module_handle = fi_resp->module_handle; > + for (i = 0; i < AEM2_NUM_ENERGY_REGS; i++) > + data->power_period[i] = AEM_DEFAULT_POWER_INTERVAL; > + > + /* Create sub-device for this fw instance */ > + if (aem2_idr_get(&data->fw.id)) > + goto id_err; > + > + data->fw.pdev = platform_device_alloc(DRVNAME "2", data->fw.id); platform_device_alloc() can fail. > + data->fw.pdev->dev.driver = &aem_driver; which will result in an oops here. > + if (IS_ERR(data->fw.pdev)) platform_device_alloc() returns NULL on failure, not ERR_PTR. > + goto dev_err; > + > + res = platform_device_add(data->fw.pdev); > + if (res) > + goto ipmi_err; > + > + dev_set_drvdata(&data->fw.pdev->dev, &data->fw); > + > + /* Set up IPMI interface */ > + if (aem_init_ipmi_data(&data->fw.ipmi, probe->interface, > + probe->bmc_device)) > + goto ipmi_err; > + > + /* Register with hwmon */ > + data->fw.hwmon_dev = hwmon_device_register(&data->fw.pdev->dev); > + > + if (IS_ERR(data->fw.hwmon_dev)) { > + dev_err(&data->fw.pdev->dev, "Unable to register hwmon " > + "device for IPMI interface %d\n", > + probe->interface); > + goto hwmon_reg_err; > + } > + > + /* Find sensors */ > + if (aem2_find_sensors(data)) > + goto sensor_err; > + > + /* Add to our list of AEM2 devices */ > + list_add_tail(&data->list, &driver_data.aem2_devices); > + dev_info(data->fw.ipmi.bmc_device, "Found AEM v%d.%d at 0x%X\n", > + data->fw.ver_major, data->fw.ver_minor, > + data->fw.module_handle); > + return 0; > + > +sensor_err: > + hwmon_device_unregister(data->fw.hwmon_dev); > +hwmon_reg_err: > + ipmi_destroy_user(data->fw.ipmi.user); > +ipmi_err: > + dev_set_drvdata(&data->fw.pdev->dev, NULL); > + platform_device_unregister(data->fw.pdev); > +dev_err: > + aem2_idr_put(data->fw.id); > +id_err: > + kfree(data); > + > + return res; > +} > + > +/* Find and initialize all AEM2 instances */ > +static void aem_init_aem2(struct aem_ipmi_data *probe) > +{ > + struct aem_find_instance_resp fi_resp; > + int err; > + int i = 0; > + > + while (!aem_find_aem2(probe, &fi_resp, i)) { > + if (fi_resp.major != 2) { > + dev_err(probe->bmc_device, "Unknown AEM v%d; please " > + "report this to the maintainer.\n", > + fi_resp.major); > + i++; > + continue; > + } > + err = aem_init_aem2_inst(probe, &fi_resp); > + if (err) { > + dev_err(probe->bmc_device, > + "Error %d initializing AEM2 0x%X\n", > + err, fi_resp.module_handle); > + return; > + } > + i++; > + } > +} > + > +/* Delete an AEM2 instance */ > +AEM_DELETE(aem2) > + > +/* Sensor support functions */ > + > +/* Read a sensor value */ > +static int aem_read_sensor(struct aem_fw_data *fwdata, u8 elt, u8 reg, > + void *data, size_t size) > +{ > + int rs_size; > + struct aem_read_sensor_req rs_req; > + struct aem_read_sensor_resp *rs_resp; > + struct aem_ipmi_data *ipmi = &fwdata->ipmi; > + > + /* AEM registers are 1, 2, 4 or 8 bytes */ > + switch (size) { > + case 1: > + case 2: > + case 4: > + case 8: > + break; > + default: > + return -EINVAL; > + } > + > + rs_req.id = system_x_id; > + rs_req.module_handle = fwdata->module_handle; > + rs_req.element = elt; > + rs_req.subcommand = AEM_READ_REGISTER; > + rs_req.reg = reg; > + rs_req.rx_buf_size = size; > + > + ipmi->tx_message.cmd = AEM_ELEMENT_CMD; > + ipmi->tx_message.data = (char *)&rs_req; > + ipmi->tx_message.data_len = sizeof(rs_req); > + > + rs_size = sizeof(*rs_resp) + size; > + rs_resp = kzalloc(rs_size, GFP_KERNEL); > + if (!rs_resp) > + return -ENOMEM; > + > + ipmi->rx_msg_data = rs_resp; > + ipmi->rx_msg_len = rs_size; > + > + aem_send_message(ipmi); > + > + wait_for_completion(&ipmi->read_complete); > + > + if (ipmi->rx_result || ipmi->rx_msg_len != rs_size || > + memcmp(&rs_resp->id, &system_x_id, sizeof(system_x_id))) { > + kfree(rs_resp); > + return -ENOENT; > + } > + > + switch (size) { > + case 1: { > + u8 *x = data; > + *x = rs_resp->bytes[0]; > + break; > + } > + case 2: { > + u16 *x = data; > + *x = be16_to_cpup((u16 *)rs_resp->bytes); > + break; > + } > + case 4: { > + u32 *x = data; > + *x = be32_to_cpup((u32 *)rs_resp->bytes); > + break; > + } > + case 8: { > + u64 *x = data; > + *x = be64_to_cpup((u64 *)rs_resp->bytes); > + break; > + } > + } > + > + return 0; > +} > + > +/* Update AEM1 energy sensors */ > +static void update_aem1_energy_sensors(struct aem1_data *data) > +{ > + aem_read_sensor(&data->fw, AEM_ENERGY_ELEMENT, 0, &data->energy, 8); > +} > + > +/* Update all AEM1 sensors */ > +static void update_aem1_sensors(struct aem1_data *data) > +{ > + mutex_lock(&data->fw.lock); > + if (time_before(jiffies, data->fw.last_updated + REFRESH_INTERVAL) && > + data->fw.valid) > + goto out; > + > + update_aem1_energy_sensors(data); > +out: > + mutex_unlock(&data->fw.lock); > +} > + > +/* Update AEM2 energy sensors */ > +static void update_aem2_energy_sensors(struct aem2_data *data) > +{ > + aem_read_sensor(&data->fw, AEM_ENERGY_ELEMENT, 0, &data->energy[0], 8); > + aem_read_sensor(&data->fw, AEM_ENERGY_ELEMENT, 1, &data->energy[1], 8); > +} > + > +/* Update all AEM2 sensors */ > +static void update_aem2_sensors(struct aem2_data *data) > +{ > + int i; > + > + mutex_lock(&data->fw.lock); > + if (time_before(jiffies, data->fw.last_updated + REFRESH_INTERVAL) && > + data->fw.valid) > + goto out; > + > + update_aem2_energy_sensors(data); > + aem_read_sensor(&data->fw, AEM_EXHAUST_ELEMENT, 0, &data->temp[0], 1); > + aem_read_sensor(&data->fw, AEM_EXHAUST_ELEMENT, 1, &data->temp[1], 1); > + > + for (i = POWER_CAP; i <= POWER_AUX; i++) > + aem_read_sensor(&data->fw, AEM_POWER_CAP_ELEMENT, i, > + &data->pcap[i], 2); > +out: > + mutex_unlock(&data->fw.lock); > +} > + > +/* sysfs support functions */ > + > +#define AEM_SHOW_POWER(type) \ > +static ssize_t type##_show_power(struct device *dev, \ > + struct device_attribute *devattr, \ > + char *buf) \ > +{ \ > + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); \ > + struct aem_fw_data *fwdata = dev_get_drvdata(dev); \ > + struct type##_data * a = container_of(fwdata, \ > + struct type##_data, fw); \ > + u64 before, after; \ > + signed long leftover; \ > +\ > + mutex_lock(&fwdata->lock); \ > + update_##type##_energy_sensors(a); \ > + before = a->energy[attr->index]; \ > +\ > + leftover = schedule_timeout_interruptible( \ > + msecs_to_jiffies(a->power_period[attr->index]) \ > + ); \ > + if (leftover) { \ > + mutex_unlock(&fwdata->lock); \ > + return 0; \ > + } \ > +\ > + update_##type##_energy_sensors(a); \ > + after = a->energy[attr->index]; \ > + mutex_unlock(&fwdata->lock); \ > +\ > + return sprintf(buf, "%llu\n", (after - before) * 1000000 / \ > + a->power_period[attr->index]); \ > +} > + > +/* Display energy use */ > +#define AEM_SHOW_ENERGY(type) \ > +static ssize_t type##_show_energy(struct device *dev, \ > + struct device_attribute *devattr, \ > + char *buf) \ > +{ \ > + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); \ > + struct aem_fw_data *fwdata = dev_get_drvdata(dev); \ > + struct type##_data * a = container_of(fwdata, struct type##_data, fw); \ > + update_##type##_sensors(a); \ > +\ > + return sprintf(buf, "%llu\n", a->energy[attr->index] * 1000); \ > +} > + > +/* Display power interval registers */ > +#define AEM_SHOW_POWER_PERIOD(type) \ > +static ssize_t type##_show_power_period(struct device *dev, \ > + struct device_attribute *devattr, \ > + char *buf) \ > +{ \ > + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); \ > + struct aem_fw_data *fwdata = dev_get_drvdata(dev); \ > + struct type##_data * a = container_of(fwdata, struct type##_data, fw); \ > + update_##type##_sensors(a); \ > +\ > + return sprintf(buf, "%d\n", a->power_period[attr->index]); \ > +} > + > +/* Set power interval registers */ > +#define AEM_SET_POWER_PERIOD(type) \ > +static ssize_t type##_set_power_period(struct device *dev, \ > + struct device_attribute *devattr, \ > + const char *buf, size_t count) \ > +{ \ > + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); \ > + struct aem_fw_data *fwdata = dev_get_drvdata(dev); \ > + struct type##_data * a = container_of(fwdata, struct type##_data, fw); \ > + int temp = simple_strtol(buf, NULL, 10); \ > +\ > + if (temp < AEM_MIN_POWER_INTERVAL) \ > + return -EINVAL; \ > +\ > + mutex_lock(&a->fw.lock); \ > + a->power_period[attr->index] = temp; \ > + mutex_unlock(&a->fw.lock); \ > +\ > + return count; \ > +} > + > +/* Remove sensors attached to an AEM device */ > +#define AEM_REMOVE_SENSORS(type, num_sensors) \ > +static void type##_remove_sensors(struct type##_data *data) \ > +{ \ > + int i; \ > +\ > + for (i = 0; i < num_sensors; i++) { \ > + if (!data->sensors[i].dev_attr.attr.name) \ > + continue; \ > + device_remove_file(&data->fw.pdev->dev, \ > + &data->sensors[i].dev_attr); \ > + } \ > +\ > + device_remove_file(&data->fw.pdev->dev, \ > + &sensor_dev_attr_name.dev_attr); \ > +} ho hum.