Rework of the old PCI code

 * adds support for VPP native PCI drivers using
   standard uio_pci_generic kernel driver
 * adds generic PCI interrupt callback
 * splits code into generic PCI handling and Linux-specific parts
 * adds new debug cli 'show pci [all]'

Change-Id: I447c2285e319e9725d70688c1b70c9dedda51fdc
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/vlib/Makefile.am b/vlib/Makefile.am
index 17b23df..981c2be 100644
--- a/vlib/Makefile.am
+++ b/vlib/Makefile.am
@@ -37,6 +37,8 @@
   vlib/node.c					\
   vlib/node_cli.c				\
   vlib/node_format.c				\
+  vlib/pci/pci.c				\
+  vlib/pci/linux_pci.c				\
   vlib/threads.c				\
   vlib/trace.c
 
@@ -83,14 +85,12 @@
   vlib/unix/plugin.c				\
   vlib/unix/plugin.h				\
   vlib/unix/physmem.c				\
-  vlib/unix/pci.c				\
   vlib/unix/util.c
 
 nobase_include_HEADERS +=			\
   vlib/unix/cj.h				\
   vlib/unix/mc_socket.h				\
   vlib/unix/physmem.h				\
-  vlib/unix/pci.h				\
   vlib/unix/plugin.h				\
   vlib/unix/unix.h
     
diff --git a/vlib/vlib/unix/pci.c b/vlib/vlib/pci/linux_pci.c
similarity index 70%
rename from vlib/vlib/unix/pci.c
rename to vlib/vlib/pci/linux_pci.c
index b28b542..65b4111 100644
--- a/vlib/vlib/unix/pci.c
+++ b/vlib/vlib/pci/linux_pci.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Copyright (c) 2016 Cisco and/or its affiliates.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
@@ -40,7 +40,6 @@
 #include <vlib/vlib.h>
 #include <vlib/pci/pci.h>
 #include <vlib/unix/unix.h>
-#include <vlib/unix/pci.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -51,6 +50,48 @@
 #include <linux/ethtool.h>
 #include <linux/sockios.h>
 
+typedef struct {
+  /* /sys/bus/pci/devices/... directory name for this device (vector; add_device keeps its own vec_dup copy). */
+  u8 * dev_dir_name;
+
+  /* Resource file descriptors. */
+  int * resource_fds;
+
+  /* File descriptor for config space read/write (accessed with pread/pwrite). */
+  int config_fd;
+
+  /* File descriptor for /dev/uio%d; add_device registers it with unix_file_add. */
+  int uio_fd;
+
+  /* Minor device for uio device. */
+  u32 uio_minor;
+
+  /* Index returned by unix_file_add for uio_fd. */
+  u32 unix_file_index;
+
+} linux_pci_device_t;
+
+/* Linux-specific PCI state: the pool of per-device OS data. */
+typedef struct {
+  vlib_main_t * vlib_main;
+  linux_pci_device_t * linux_pci_devices;  /* pool, indexed by vlib_pci_device_t.os_handle */
+} linux_pci_main_t;
+
+extern linux_pci_main_t linux_pci_main;
+
+always_inline linux_pci_device_t *
+pci_dev_for_linux (vlib_pci_device_t * dev)
+{
+  /* dev->os_handle is the device's index in the linux_pci_devices pool. */
+  return pool_elt_at_index (linux_pci_main.linux_pci_devices, dev->os_handle);
+}
+
+/* Call to allocate/initialize the pci subsystem.
+   This is not an init function so that users can explicitly enable
+   pci only when it's needed. */
+clib_error_t * pci_bus_init (vlib_main_t * vm);
+
+clib_error_t * vlib_pci_bind_to_uio (vlib_pci_device_t * d, char * uio_driver_name);
 
 linux_pci_main_t linux_pci_main;
 
@@ -127,15 +168,15 @@
   vec_reset_length (s);
 
   s = format (s, "%v/driver/unbind%c", dev_dir_name, 0);
-  write_sys_fs ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
   vec_reset_length (s);
 
   s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0);
-  write_sys_fs ((char *) s, "0x%04x 0x%04x", c->vendor_id, c->device_id);
+  vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", c->vendor_id, c->device_id);
   vec_reset_length (s);
 
   s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0);
-  write_sys_fs ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
 
 done:
   closedir (dir);
@@ -162,19 +203,19 @@
 
 static clib_error_t * linux_pci_uio_read_ready (unix_file_t * uf)
 {
-  linux_pci_main_t * pm = &linux_pci_main;
-  vlib_main_t * vm = pm->vlib_main;
-  linux_pci_device_t * l;
-  u32 li = uf->private_data;
+  vlib_pci_main_t * pm = &pci_main;
+  vlib_pci_device_t * d;
+  int __attribute__ ((unused)) rv;
 
-  l = pool_elt_at_index (pm->linux_pci_devices, li);
-  vlib_node_set_interrupt_pending (vm, l->device_input_node_index);
+  u32 icount;
+  rv = read(uf->file_descriptor, &icount, 4);
 
-  /* Let node know which device is interrupting. */
-  {
-    vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, l->device_input_node_index);
-    rt->runtime_data[0] |= 1 << l->device_index;
-  }
+  d = pool_elt_at_index (pm->pci_devs, uf->private_data);
+
+  if (d->interrupt_handler)
+    d->interrupt_handler(d);
+
+  vlib_pci_intr_enable(d);
 
   return /* no error */ 0;
 }
@@ -186,67 +227,18 @@
   return clib_error_return (0, "pci device %d: error", error_index);
 }
 
-static uword pci_resource_size (uword os_handle, uword resource)
-{
-  linux_pci_main_t * pm = &linux_pci_main;
-  linux_pci_device_t * p;
-  u8 * file_name;
-  struct stat b;
-  uword result = 0;
-
-  p = pool_elt_at_index (pm->linux_pci_devices, os_handle);
-
-  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
-  if (stat ((char *) file_name, &b) >= 0)
-    result = b.st_size;
-  vec_free (file_name);
-  return result;
-}
-
-void os_add_pci_disable_interrupts_reg (uword os_handle, u32 resource,
-					u32 reg_offset, u32 reg_value)
-{
-  linux_pci_main_t * pm = &linux_pci_main;
-  linux_pci_device_t * l;
-  char * file_name;
-  clib_error_t * error;
-
-  l = pool_elt_at_index (pm->linux_pci_devices, os_handle);
-  ASSERT (resource == 0);
-  ASSERT (reg_offset < pci_resource_size (os_handle, resource));
-  file_name = (char *) format (0, "%s/disable_interrupt_regs%c", l->dev_dir_name, 0);
-  error = write_sys_fs (file_name, "%x %x", reg_offset, reg_value);
-  if (error)
-    clib_error_report (error);
-  vec_free (file_name);
-}
-
 static void add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev)
 {
-  linux_pci_main_t * pm = &linux_pci_main;
+  vlib_pci_main_t * pm = &pci_main;
+  linux_pci_main_t * lpm = &linux_pci_main;
   linux_pci_device_t * l;
-  pci_config_header_t * c;
-  u32 x[4];
-  clib_error_t * error;
 
-  c = &dev->config0.header;
-
-  pool_get (pm->linux_pci_devices, l);
+  pool_get (lpm->linux_pci_devices, l);
   l[0] = pdev[0];
 
   l->dev_dir_name = vec_dup (l->dev_dir_name);
 
-  dev->os_handle = l - pm->linux_pci_devices;
-
-  error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/new_id",
-			"%x %x", c->vendor_id, c->device_id);
-  if (error)
-    clib_error_report (error);
-  error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/bind",
-			"%04x:%02x:%02x.%x", x[0], x[1], x[2], x[3]);
-  /* Errors happen when re-binding so just ignore them. */
-  if (error)
-    clib_error_free (error);
+  dev->os_handle = l - lpm->linux_pci_devices;
 
   {
     u8 * uio_dir = format (0, "%s/uio", l->dev_dir_name);
@@ -269,11 +261,7 @@
     template.read_function = linux_pci_uio_read_ready;
     template.file_descriptor = l->uio_fd;
     template.error_function = linux_pci_uio_error_ready;
-    template.private_data = l - pm->linux_pci_devices;
-
-    /* To be filled in by driver. */
-    l->device_input_node_index = ~0;
-    l->device_index = 0;
+    template.private_data = dev - pm->pci_devs;
 
     l->unix_file_index = unix_file_add (um, &template);
   }
@@ -295,25 +283,22 @@
 
 /* Configuration space read/write. */
 clib_error_t *
-os_read_write_pci_config (uword os_handle,
-			  vlib_read_or_write_t read_or_write,
-			  uword address,
-			  void * data,
-			  u32 n_bytes)
+vlib_pci_read_write_config (vlib_pci_device_t * dev,
+			    vlib_read_or_write_t read_or_write,
+			    uword address,
+			    void * data,
+			    u32 n_bytes)
 {
-  linux_pci_main_t * pm = &linux_pci_main;
+  linux_pci_main_t * lpm = &linux_pci_main;
   linux_pci_device_t * p;
   int n;
 
-  p = pool_elt_at_index (pm->linux_pci_devices, os_handle);
-
-  if (address != lseek (p->config_fd, address, SEEK_SET))
-    return clib_error_return_unix (0, "seek offset %d", address);
+  p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle);
 
   if (read_or_write == VLIB_READ)
-    n = read (p->config_fd, data, n_bytes);
+    n = pread (p->config_fd, data, n_bytes, address);
   else
-    n = write (p->config_fd, data, n_bytes);
+    n = pwrite (p->config_fd, data, n_bytes, address);
 
   if (n != n_bytes)
     return clib_error_return_unix (0, "%s",
@@ -382,44 +367,33 @@
 }
 
 clib_error_t *
-os_map_pci_resource (uword os_handle,
-		     u32 resource,
-		     void ** result)
+vlib_pci_map_resource (vlib_pci_device_t * dev,
+		       u32 resource,
+		       void ** result)
 {
-  return (os_map_pci_resource_internal (os_handle, resource, 0 /* addr */,
+  return (os_map_pci_resource_internal (dev->os_handle, resource, 0 /* addr */,
                                         result));
 }
 
 clib_error_t *
-os_map_pci_resource_fixed (uword os_handle,
-                           u32 resource,
-                           u8 *addr,
-                           void ** result)
+vlib_pci_map_resource_fixed (vlib_pci_device_t * dev,
+			    u32 resource,
+			    u8 *addr,
+			    void ** result)
 {
-  return (os_map_pci_resource_internal (os_handle, resource, addr, result));
+  return (os_map_pci_resource_internal (dev->os_handle, resource, addr, result));
 }
 
-void os_free_pci_device (uword os_handle)
+void vlib_pci_free_device (vlib_pci_device_t * dev)
 {
   linux_pci_main_t * pm = &linux_pci_main;
   linux_pci_device_t * l;
 
-  l = pool_elt_at_index (pm->linux_pci_devices, os_handle);
+  l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle);
   linux_pci_device_free (l);
   pool_put (pm->linux_pci_devices, l);
 }
 
-u8 * format_os_pci_handle (u8 * s, va_list * va)
-{
-  linux_pci_main_t * pm = &linux_pci_main;
-  uword os_pci_handle = va_arg (*va, uword);
-  linux_pci_device_t * l;
-
-  l = pool_elt_at_index (pm->linux_pci_devices, os_pci_handle);
-  return format (s, "%x/%x/%x", l->bus_address.bus,
-		 l->bus_address.slot, l->bus_address.function);
-}
-
 pci_device_registration_t * __attribute__((unused))
 pci_device_next_registered (pci_device_registration_t * r)
 {
@@ -432,52 +406,35 @@
   return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0]));
 }
 
-static inline u8 kernel_driver_installed (pci_device_registration_t *r)
-{
-  u8 * link_name;
-  struct stat b;
-
-  link_name = format (0, "/sys/bus/pci/drivers/%s", r->kernel_driver);
-  if (stat ((char *)link_name, &b) >= 0)
-    r->kernel_driver_running++;
-  else
-    r->kernel_driver_running=0;
-
-  vec_free (link_name);
-  return r->kernel_driver_running;
-}
-
 static clib_error_t *
 init_device_from_registered (vlib_main_t * vm,
 			     vlib_pci_device_t * dev,
 			     linux_pci_device_t * pdev)
 {
-  linux_pci_main_t * lpm = &linux_pci_main;
+  vlib_pci_main_t * pm = &pci_main;
   pci_device_registration_t * r;
   pci_device_id_t * i;
   pci_config_header_t * c;
+  clib_error_t * error;
 
   c = &dev->config0.header;
 
-  r = lpm->pci_device_registrations;
+  r = pm->pci_device_registrations;
 
   while (r)
     {
       for (i = r->supported_devices; i->vendor_id != 0; i++)
         if (i->vendor_id == c->vendor_id && i->device_id == c->device_id)
           {
-            if (r->kernel_driver && kernel_driver_installed(r))
-              {
-                if (r->kernel_driver_running == 1)
-                  {
-                    clib_warning("PCI device type [%04x:%04x] is busy!\n"
-                                 "\tUninstall the associated linux kernel "
-                                 "driver:  sudo rmmod %s",
-                                 c->vendor_id, c->device_id, r->kernel_driver);
-                  }
-                continue;
-              }
+	    error = vlib_pci_bind_to_uio (dev, "uio_pci_generic");
+	    if (error)
+	      {
+		clib_error_report (error);
+		continue;
+	      }
+
             add_device (dev, pdev);
+	    dev->interrupt_handler = r->interrupt_handler;
             return r->init_function (vm, dev);
           }
       r = r->next_registration;
@@ -499,7 +456,7 @@
 scan_device (void * arg, u8 * dev_dir_name, u8 * ignored)
 {
   vlib_main_t * vm = arg;
-  linux_pci_main_t * pm = &linux_pci_main;
+  vlib_pci_main_t * pm = &pci_main;
   int fd;
   u8 * f;
   clib_error_t * error = 0;
@@ -562,7 +519,6 @@
 
     unformat_free (&input);
 
-    pdev.bus_address = dev->bus_address;
   }
 
 
@@ -574,14 +530,60 @@
 
   error = init_device (vm, dev, &pdev);
 
+  vec_reset_length(f);
+  f = format (f, "%v/vpd%c", dev_dir_name, 0);
+  fd = open ((char *) f, O_RDONLY);
+  if (fd >=  0)
+    {
+      while (1)
+	{
+	  u8 tag[3];
+	  u8 * data = 0;
+	  int len;
+
+	  if (read (fd, &tag, 3) != 3)
+	    break;
+
+	  if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91)
+	    break;
+
+	  len = (tag[2] << 8) | tag[1];
+	  vec_validate(data, len);
+
+	  if (read (fd, data, len) != len)
+	    {
+	      vec_free (data);
+	      break;
+	    }
+	  if (tag[0] == 0x82)
+	    dev->product_name = data;
+	  else if (tag[0] == 0x90)
+	    dev->vpd_r = data;
+	  else if (tag[0] == 0x91)
+	    dev->vpd_w = data;
+
+	  data = 0;
+	}
+      close (fd);
+    }
+
+  vec_reset_length(f);
+  f = format (f, "%v/driver%c", dev_dir_name, 0);
+  dev->driver_name = vlib_sysfs_link_to_name((char *) f);
+
+  dev->numa_node = -1;
+  vec_reset_length(f);
+  f = format (f, "%v/numa_node%c", dev_dir_name, 0);
+  vlib_sysfs_read ((char *) f, "%u", &dev->numa_node);
+
  done:
   vec_free (f);
   return error;
 }
 
-clib_error_t * pci_bus_init (vlib_main_t * vm)
+clib_error_t * linux_pci_init (vlib_main_t * vm)
 {
-  linux_pci_main_t * pm = &linux_pci_main;
+  vlib_pci_main_t * pm = &pci_main;
   clib_error_t * error;
 
   pm->vlib_main = vm;
@@ -601,4 +603,4 @@
   return error;
 }
 
-VLIB_INIT_FUNCTION (pci_bus_init);
+VLIB_INIT_FUNCTION (linux_pci_init);
diff --git a/vlib/vlib/pci/pci.c b/vlib/vlib/pci/pci.c
new file mode 100644
index 0000000..bbd1221
--- /dev/null
+++ b/vlib/vlib/pci/pci.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pci.c: Linux user space PCI bus management.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/pci/pci.h>
+#include <vlib/unix/unix.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
+vlib_pci_main_t pci_main;
+
+static clib_error_t *
+show_pci_fn (vlib_main_t * vm,
+       unformat_input_t * input,
+       vlib_cli_command_t * cmd)
+{
+  /* "show pci [all]": one table row per device; ethernet only unless "all". */
+  vlib_pci_device_t * dev;
+  pci_config_header_t * hdr;
+  u8 * socket = 0;	/* scratch vector for the Socket column */
+  int all_classes = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "all"))
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+      all_classes = 1;
+    }
+
+  vlib_cli_output (vm, "%-13s%-7s%-12s%-15s%-20s%-40s",
+                   "Address", "Socket", "VID:PID", "Link Speed", "Driver", "Product Name");
+
+  pool_foreach (dev, pci_main.pci_devs, ({
+    char * driver = dev->driver_name ? (char *) dev->driver_name : "";
+
+    hdr = &dev->config0.header;
+    if (!all_classes && hdr->device_class != PCI_CLASS_NETWORK_ETHERNET)
+      continue;
+
+    /* Socket column stays empty when the NUMA node is negative. */
+    vec_reset_length (socket);
+    if (dev->numa_node >= 0)
+      socket = format (socket, "  %d", dev->numa_node);
+
+    vlib_cli_output (vm, "%-13U%-7v%04x:%04x   %-15U%-20s%-40v",
+		     format_vlib_pci_addr, &dev->bus_address, socket,
+		     hdr->vendor_id, hdr->device_id,
+		     format_vlib_pci_link_speed, dev,
+		     driver,
+		     dev->product_name);
+  }));
+
+  vec_free(socket);
+  return 0;
+}
+
+uword
+unformat_vlib_pci_addr (unformat_input_t * input, va_list * args)
+{
+  /* Parse "domain:bus:slot.function" (all hex); returns 1 on a match, else 0. */
+  vlib_pci_addr_t * out = va_arg (* args, vlib_pci_addr_t *);
+  u32 domain, bus, slot, function;
+  if (unformat (input, "%x:%x:%x.%x", &domain, &bus, &slot, &function))
+    {
+      out->domain = domain;
+      out->bus = bus;
+      out->slot = slot;
+      out->function = function;
+      return 1;
+    }
+  return 0;
+}
+
+u8 *
+format_vlib_pci_addr (u8 * s, va_list * va)
+{
+  /* Append the address to s as "dddd:bb:ss.f" (hex). */
+  vlib_pci_addr_t * a = va_arg (* va, vlib_pci_addr_t *);
+  return format (s, "%04x:%02x:%02x.%x", a->domain, a->bus, a->slot, a->function);
+}
+
+u8 *
+format_vlib_pci_handle (u8 * s, va_list * va)
+{
+  vlib_pci_addr_t * a = va_arg (* va, vlib_pci_addr_t *);	/* domain is not printed */
+  return format (s, "%x/%x/%x", a->bus, a->slot, a->function);
+}
+
+u8 *
+format_vlib_pci_link_speed (u8 *s, va_list * va)
+{
+  /* Appends "<rate> GT/s x<width>" from the PCIe link status, else "unknown". */
+  vlib_pci_device_t * dev = va_arg (* va, vlib_pci_device_t *);
+  pcie_config_regs_t * pcie = pci_config_find_capability (&dev->config0, PCI_CAP_ID_PCIE);
+  int lanes;
+
+  if (pcie == 0)
+    return format (s, "unknown");
+
+  lanes = (pcie->link_status >> 4) & 0x3f;	/* bits 9:4 */
+  switch (pcie->link_status & 0xf)		/* bits 3:0 */
+    {
+    case 1: return format (s, "2.5 GT/s x%u", lanes);
+    case 2: return format (s, "5.0 GT/s x%u", lanes);
+    case 3: return format (s, "8.0 GT/s x%u", lanes);
+    default: return format (s, "unknown");
+    }
+}
+
+
+VLIB_CLI_COMMAND (show_pci_command, static) = {
+  .path = "show pci",
+  .short_help = "show pci [all]",	/* "all" also lists non-ethernet devices */
+  .function = show_pci_fn,
+};
+
+clib_error_t * pci_bus_init (vlib_main_t * vm)
+{
+  return 0;	/* no generic bus setup is done here; always reports success */
+}
+
+VLIB_INIT_FUNCTION (pci_bus_init);
diff --git a/vlib/vlib/pci/pci.h b/vlib/vlib/pci/pci.h
index 737e28e..7e8a9fc 100644
--- a/vlib/vlib/pci/pci.h
+++ b/vlib/vlib/pci/pci.h
@@ -53,7 +53,7 @@
    u32 as_u32;
 }) vlib_pci_addr_t;
 
-typedef struct {
+typedef struct vlib_pci_device {
   /* Operating system handle for this device. */
   uword os_handle;
 
@@ -65,6 +65,24 @@
     pci_config_type1_regs_t config1;
     u8 config_data[256];
   };
+
+  /* Interrupt handler */
+  void (* interrupt_handler) (struct vlib_pci_device * dev);
+
+  /* Driver name */
+  u8 * driver_name;
+
+  /* Numa Node */
+  int numa_node;
+
+  /* Vital Product Data */
+  u8 * product_name;
+  u8 * vpd_r;
+  u8 * vpd_w;
+
+  /* Private data */
+  uword private_data;
+
 } vlib_pci_device_t;
 
 typedef struct {
@@ -75,8 +93,8 @@
   /* Driver init function. */
   clib_error_t * (* init_function) (vlib_main_t * vm, vlib_pci_device_t * dev);
 
-  char const *kernel_driver;
-  u8 kernel_driver_running;
+  /* Interrupt handler */
+  void (* interrupt_handler) (vlib_pci_device_t * dev);
 
   /* List of registrations */
   struct _pci_device_registration * next_registration;
@@ -85,33 +103,46 @@
   pci_device_id_t supported_devices[];
 } pci_device_registration_t;
 
+/* Generic PCI state: device pool and driver registrations. */
+typedef struct {
+  vlib_main_t * vlib_main;
+  vlib_pci_device_t * pci_devs;	/* pool of devices */
+  pci_device_registration_t * pci_device_registrations;	/* list head; PCI_REGISTER_DEVICE prepends to it */
+  uword * pci_dev_index_by_pci_addr;	/* presumably a hash from PCI address to pci_devs index -- TODO confirm */
+} vlib_pci_main_t;
+
+extern vlib_pci_main_t pci_main;
+
 #define PCI_REGISTER_DEVICE(x,...)                              \
     __VA_ARGS__ pci_device_registration_t x;                    \
 static void __vlib_add_pci_device_registration_##x (void)       \
     __attribute__((__constructor__)) ;                          \
 static void __vlib_add_pci_device_registration_##x (void)       \
 {                                                               \
-    linux_pci_main_t * lpm = vlib_unix_get_main();              \
-    x.next_registration = lpm->pci_device_registrations;        \
-    lpm->pci_device_registrations = &x;                         \
+    vlib_pci_main_t * pm = &pci_main;                           \
+    x.next_registration = pm->pci_device_registrations;         \
+    pm->pci_device_registrations = &x;                          \
 }                                                               \
-__VA_ARGS__ pci_device_registration_t x 
+__VA_ARGS__ pci_device_registration_t x
 
+clib_error_t *
+vlib_pci_bind_to_uio (vlib_pci_device_t * d, char * uio_driver_name);
 
 /* Configuration space read/write. */
 clib_error_t *
-os_read_write_pci_config (uword os_handle,
-			  vlib_read_or_write_t read_or_write,
-			  uword address,
-			  void * data,
-			  u32 n_bytes);
+vlib_pci_read_write_config (vlib_pci_device_t * dev,
+			    vlib_read_or_write_t read_or_write,
+			    uword address,
+			    void * data,
+			    u32 n_bytes);
 
 #define _(t)								\
 static inline clib_error_t *						\
-os_read_pci_config_##t (uword os_handle, uword address, t * data)	\
+vlib_pci_read_config_##t (vlib_pci_device_t * dev,			\
+			  uword address, t * data)			\
 {									\
-  return os_read_write_pci_config (os_handle, VLIB_READ,		\
-				   address, data, sizeof (data[0]));	\
+  return vlib_pci_read_write_config (dev, VLIB_READ,address, data,	\
+				     sizeof (data[0]));			\
 }
 
 _ (u32);
@@ -122,9 +153,10 @@
 
 #define _(t)								\
 static inline clib_error_t *						\
-os_write_pci_config_##t (uword os_handle, uword address, t * data)	\
+vlib_pci_write_config_##t (vlib_pci_device_t * dev, uword address,	\
+			   t * data)					\
 {									\
-  return os_read_write_pci_config (os_handle, VLIB_WRITE,		\
+  return vlib_pci_read_write_config (dev, VLIB_WRITE,			\
 				   address, data, sizeof (data[0]));	\
 }
 
@@ -134,43 +166,72 @@
 
 #undef _
 
-clib_error_t *
-os_map_pci_resource (uword os_handle, u32 resource, void ** result);
+static inline clib_error_t *
+vlib_pci_intr_enable(vlib_pci_device_t * dev)
+{
+  /* Read-modify-write of the 16-bit register at config offset 4:
+     clear PCI_COMMAND_INTX_DISABLE.
+     Returns the config access error, if any. */
+  clib_error_t * error;
+  u16 cmd;
+
+  if ((error = vlib_pci_read_config_u16(dev, 4, &cmd)))
+    return error;
+
+  cmd &= ~PCI_COMMAND_INTX_DISABLE;
+  return vlib_pci_write_config_u16(dev, 4, &cmd);
+}
+
+static inline clib_error_t *
+vlib_pci_intr_disable(vlib_pci_device_t * dev)
+{
+  /* Read-modify-write of the 16-bit register at config offset 4:
+     set PCI_COMMAND_INTX_DISABLE.
+     Returns the config access error, if any. */
+  clib_error_t * error;
+  u16 cmd;
+
+  if ((error = vlib_pci_read_config_u16(dev, 4, &cmd)))
+    return error;
+
+  cmd |= PCI_COMMAND_INTX_DISABLE;
+  return vlib_pci_write_config_u16(dev, 4, &cmd);
+}
+
+static inline clib_error_t *
+vlib_pci_bus_master_enable(vlib_pci_device_t * dev)
+{
+  /* Set PCI_COMMAND_BUS_MASTER (BME) in the 16-bit register at config
+     offset 4. Returns 0 on success or when the bit is already set,
+     otherwise the config read/write error. */
+  clib_error_t * err;
+  u16 command;
+
+  err = vlib_pci_read_config_u16(dev, 4, &command);
+  if (err)
+    return err;
+
+  /* Already enabled: skip the redundant config write. */
+  if (command & PCI_COMMAND_BUS_MASTER)
+    return 0;
+  command |= PCI_COMMAND_BUS_MASTER;
+  return vlib_pci_write_config_u16(dev, 4, &command);
+}
 
 clib_error_t *
-os_map_pci_resource_fixed (uword os_handle, u32 resource, u8 * addr, 
+vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void ** result);
+
+clib_error_t *
+vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, u32 resource, u8 * addr,
                            void ** result);
 
 /* Free's device. */
-void os_free_pci_device (uword os_handle);
+void
+vlib_pci_free_device (vlib_pci_device_t * dev);
 
-void os_add_pci_disable_interrupts_reg (uword os_handle, u32 resource, u32 reg_offset, u32 reg_value);
-
-format_function_t format_os_pci_handle;
-
-static inline uword
-unformat_vlib_pci_addr (unformat_input_t * input, va_list * args)
-{
-  vlib_pci_addr_t * addr = va_arg (* args, vlib_pci_addr_t *);
-  u32 x[4];
-
-  if (!unformat (input, "%x:%x:%x.%x", &x[0], &x[1], &x[2], &x[3]))
-    return 0;
-
-  addr->domain   = x[0];
-  addr->bus      = x[1];
-  addr->slot     = x[2];
-  addr->function = x[3];
-
-  return 1;
-}
-
-static inline u8 *
-format_vlib_pci_addr (u8 * s, va_list * va)
-{
-  vlib_pci_addr_t * addr = va_arg (* va, vlib_pci_addr_t *);
-  return format (s, "%04x:%02x:%02x.%x", addr->domain, addr->bus,
-		 addr->slot, addr->function);
-}
+unformat_function_t unformat_vlib_pci_addr;
+format_function_t format_vlib_pci_addr;
+format_function_t format_vlib_pci_handle;
+format_function_t format_vlib_pci_link_speed;
 
 #endif /* included_vlib_pci_h */
diff --git a/vlib/vlib/pci/pci_config.h b/vlib/vlib/pci/pci_config.h
index 9cada51..9d6f08a 100644
--- a/vlib/vlib/pci/pci_config.h
+++ b/vlib/vlib/pci/pci_config.h
@@ -646,6 +646,18 @@
 
   u16 root_capabilities;
   u32 root_status;
+
+  u32 dev_capabilities2;
+  u16 dev_control2;
+  u16 dev_status2;
+
+  u32 link_capabilities2;
+  u16 link_control2;
+  u16 link_status2;
+
+  u32 slot_capabilities2;
+  u16 slot_control2;
+  u16 slot_status2;
 }) pcie_config_regs_t;
 
 /* PCI express extended capabilities. */
diff --git a/vlib/vlib/unix/pci.h b/vlib/vlib/unix/pci.h
deleted file mode 100644
index dcbf1cf..0000000
--- a/vlib/vlib/unix/pci.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * unix/pci.h: Linux specific pci state
- *
- * Copyright (c) 2008 Eliot Dresselhaus
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef included_unix_pci_h
-#define included_unix_pci_h
-
-#include <vlib/pci/pci.h>
-
-typedef struct {
-  /* /sys/bus/pci/devices/... directory name for this device. */
-  u8 * dev_dir_name;
-
-  /* Resource file descriptors. */
-  int * resource_fds;
-
-  /* File descriptor for config space read/write. */
-  int config_fd;
-
-  /* PCI bus address for this devices parsed from /sys/bus/pci/devices name. */
-  vlib_pci_addr_t bus_address;
-
-  /* File descriptor for /dev/uio%d */
-  int uio_fd;
-
-  /* Minor device for uio device. */
-  u32 uio_minor;
-
-  /* Index given by unix_file_add. */
-  u32 unix_file_index;
-
-  /* Input node to handle interrupts for this device. */ 
-  u32 device_input_node_index;
-  
-  /* Node runtime will be a bitmap of device indices with pending interrupts. */
-  u32 device_index;
-} linux_pci_device_t;
-
-/* Pool of PCI devices. */
-typedef struct {
-  vlib_main_t * vlib_main;
-  vlib_pci_device_t * pci_devs;
-  linux_pci_device_t * linux_pci_devices;
-  pci_device_registration_t * pci_device_registrations;
-  uword * pci_dev_index_by_pci_addr;
-} linux_pci_main_t;
-
-extern linux_pci_main_t linux_pci_main;
-
-always_inline linux_pci_device_t *
-pci_dev_for_linux (vlib_pci_device_t * dev)
-{
-  linux_pci_main_t * pm = &linux_pci_main;
-  return pool_elt_at_index (pm->linux_pci_devices, dev->os_handle);
-}
-
-/* Call to allocate/initialize the pci subsystem.
-   This is not an init function so that users can explicitly enable
-   pci only when it's needed. */
-clib_error_t * pci_bus_init (vlib_main_t * vm);
-
-clib_error_t * vlib_pci_bind_to_uio (vlib_pci_device_t * d, char * uio_driver_name);
-
-#endif /* included_unix_pci_h */
diff --git a/vlib/vlib/unix/physmem.c b/vlib/vlib/unix/physmem.c
index 185483d..67c2233 100644
--- a/vlib/vlib/unix/physmem.c
+++ b/vlib/vlib/unix/physmem.c
@@ -206,8 +206,6 @@
   vlib_physmem_main_t * vpm = &vm->physmem_main;
   physmem_main_t * pm = &physmem_main;
   clib_error_t * error = 0;
-  char * dev_uio_dma_file = "/dev/uio-dma";
-  int using_fake_memory = 0;
 
   /* Avoid multiple calls. */
   if (vm->os_physmem_alloc_aligned)
@@ -224,63 +222,37 @@
   if (vlib_app_physmem_init (vm, pm, physical_memory_required))
       return 0;
 
-  if (physical_memory_required)
+  if (!pm->no_hugepages && htlb_init(vm))
     {
-      if (!pm->no_hugepages && htlb_init(vm))
-        {
-          fformat(stderr, "%s: use huge pages\n", __FUNCTION__);
-          return 0;
-        }
-      pm->uio_dma_fd = open (dev_uio_dma_file, O_RDWR);
+      fformat(stderr, "%s: use huge pages\n", __FUNCTION__);
+      return 0;
     }
-  else
-    pm->uio_dma_fd = -1;
 
-  if (pm->uio_dma_fd < 0)
+  pm->mem = mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (pm->mem == MAP_FAILED)
     {
-      if (physical_memory_required)
-	{
-	  error = clib_error_return_unix (0, "open `%s'", dev_uio_dma_file);
-	  goto done;
-	}
-
-      using_fake_memory = 1;
-      pm->mem = mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (pm->mem == MAP_FAILED)
-	{
-	  error = clib_error_return_unix (0, "mmap");
-	  goto done;
-	}
-
-      pm->heap = mheap_alloc (pm->mem, pm->mem_size);
-
-      /* Identity map with a single page. */
-      vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size);
-      vec_add1 (vpm->page_table, pointer_to_uword (pm->mem));
+      error = clib_error_return_unix (0, "mmap");
+      goto done;
     }
-  else
-    error = clib_error_return (0, "uio_dma deprecated");
+
+  pm->heap = mheap_alloc (pm->mem, pm->mem_size);
+
+  /* Identity map with a single page. */
+  vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size);
+  vec_add1 (vpm->page_table, pointer_to_uword (pm->mem));
 
   vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
   vpm->virtual.start = pointer_to_uword (pm->mem);
   vpm->virtual.size = pm->mem_size;
   vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
 
-  if (using_fake_memory)
-      fformat(stderr, "%s: use fake dma pages\n", __FUNCTION__);
-  else
-      fformat(stderr, "%s: use uio dma pages\n", __FUNCTION__);
+  fformat(stderr, "%s: use fake dma pages\n", __FUNCTION__);
 
  done:
   if (error)
     {
       if (pm->mem != MAP_FAILED)
 	munmap (pm->mem, pm->mem_size);
-      if (pm->uio_dma_fd >= 0)
-	{
-	  close (pm->uio_dma_fd);
-	  pm->uio_dma_fd = -1;
-	}
     }
   return error;
 }
@@ -296,7 +268,7 @@
   physmem_main_t * pm = &physmem_main;
 
   if (pm->heap)
-      vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 0);
+      vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1);
   else
       vlib_cli_output (vm, "No physmem allocated.");
 #endif
diff --git a/vlib/vlib/unix/physmem.h b/vlib/vlib/unix/physmem.h
index a963be7..adbd347 100644
--- a/vlib/vlib/unix/physmem.h
+++ b/vlib/vlib/unix/physmem.h
@@ -36,9 +36,6 @@
 #include <sys/shm.h>
 
 typedef struct {
-  /* File descriptor for /dev/uio-dma. */
-  int uio_dma_fd;
-
   /* Virtual memory via mmaped. */
   void * mem;
 
diff --git a/vlib/vlib/unix/unix.h b/vlib/vlib/unix/unix.h
index 2922b4e..75d5df0 100644
--- a/vlib/vlib/unix/unix.h
+++ b/vlib/vlib/unix/unix.h
@@ -194,10 +194,13 @@
 /* utils */
 
 clib_error_t *
-write_sys_fs (char * file_name, char * fmt, ...);
+vlib_sysfs_write (char * file_name, char * fmt, ...);
 
 clib_error_t *
-read_sys_fs (char * file_name, char * fmt, ...);
+vlib_sysfs_read (char * file_name, char * fmt, ...);
+
+u8 *
+vlib_sysfs_link_to_name(char * link);
 
 clib_error_t *
 foreach_directory_file (char * dir_name,
diff --git a/vlib/vlib/unix/util.c b/vlib/vlib/unix/util.c
index d640360..7a3a2bf 100644
--- a/vlib/vlib/unix/util.c
+++ b/vlib/vlib/unix/util.c
@@ -59,7 +59,6 @@
   d = opendir (dir_name);
   if (! d)
     {
-      /* System has no PCI bus. */
       if (errno == ENOENT)
         return 0;
       return clib_error_return_unix (0, "open `%s'", dir_name);
@@ -101,7 +100,7 @@
 }
 
 clib_error_t *
-write_sys_fs (char * file_name, char * fmt, ...)
+vlib_sysfs_write (char * file_name, char * fmt, ...)
 {
   u8 * s;
   int fd;
@@ -124,7 +123,7 @@
 }
 
 clib_error_t *
-read_sys_fs (char * file_name, char * fmt, ...)
+vlib_sysfs_read (char * file_name, char * fmt, ...)
 {
   unformat_input_t input;
   u8 * s = 0;
@@ -163,3 +162,28 @@
   return 0;
 }
 
+u8 *
+vlib_sysfs_link_to_name(char * link)
+{
+  /* Return the last path component of the symlink target, or 0 when
+     the link cannot be read or its target contains no '/'. */
+  char target[64], * base;
+  unformat_input_t input;
+  u8 * name = 0;
+  int n_read = readlink(link, target, sizeof(target) - 1);
+
+  if (n_read < 0)
+    return 0;
+
+  /* readlink does not NUL-terminate. */
+  target[n_read] = 0;
+  base = strrchr(target, '/');
+  if (base == 0)
+    return 0;
+  base++;
+
+  unformat_init_string (&input, base, strlen (base));
+  unformat(&input, "%s", &name);
+  unformat_free (&input);
+  return name;
+}