Adriankhl committed on
Commit
ee56a37
·
1 Parent(s): 71850e7

vulkan: select only one device for single gpu with multiple drivers (llama/7582)

Browse files
Files changed (1) hide show
  1. ggml-vulkan.cpp +78 -4
ggml-vulkan.cpp CHANGED
@@ -1,5 +1,5 @@
1
  #include "ggml-vulkan.h"
2
-
3
  #ifdef GGML_VULKAN_RUN_TESTS
4
  #include <chrono>
5
  #endif
@@ -9,12 +9,13 @@
9
  #include <algorithm>
10
  #include <cmath>
11
  #include <iostream>
12
- #include <limits>
13
  #include <tuple>
14
  #include <vector>
15
  #include <sstream>
16
  #include <utility>
17
  #include <memory>
 
 
18
 
19
  #include "ggml.h"
20
  #include "ggml-backend-impl.h"
@@ -1555,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
1555
  vk::PhysicalDeviceProperties2 props2;
1556
  vk::PhysicalDeviceMaintenance3Properties props3;
1557
  vk::PhysicalDeviceSubgroupProperties subgroup_props;
 
1558
  props2.pNext = &props3;
1559
  props3.pNext = &subgroup_props;
 
1560
  physical_device.getProperties2(&props2);
1561
 
1562
  const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1600,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
1600
  fp16 = fp16 && vk12_features.shaderFloat16;
1601
 
1602
  std::string device_name = props2.properties.deviceName.data();
1603
- std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
1604
 
1605
  if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
1606
  std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1696,7 +1699,78 @@ void ggml_vk_instance_init() {
1696
  vk::PhysicalDeviceProperties props = devices[i].getProperties();
1697
 
1698
  if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
1699
- vk_instance.device_indices.push_back(i);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1700
  }
1701
  }
1702
 
 
1
  #include "ggml-vulkan.h"
2
+ #include <vulkan/vulkan_core.h>
3
  #ifdef GGML_VULKAN_RUN_TESTS
4
  #include <chrono>
5
  #endif
 
9
  #include <algorithm>
10
  #include <cmath>
11
  #include <iostream>
 
12
  #include <tuple>
13
  #include <vector>
14
  #include <sstream>
15
  #include <utility>
16
  #include <memory>
17
+ #include <limits>
18
+ #include <map>
19
 
20
  #include "ggml.h"
21
  #include "ggml-backend-impl.h"
 
1556
  vk::PhysicalDeviceProperties2 props2;
1557
  vk::PhysicalDeviceMaintenance3Properties props3;
1558
  vk::PhysicalDeviceSubgroupProperties subgroup_props;
1559
+ vk::PhysicalDeviceDriverProperties driver_props;
1560
  props2.pNext = &props3;
1561
  props3.pNext = &subgroup_props;
1562
+ subgroup_props.pNext = &driver_props;
1563
  physical_device.getProperties2(&props2);
1564
 
1565
  const size_t subgroup_size = subgroup_props.subgroupSize;
 
1603
  fp16 = fp16 && vk12_features.shaderFloat16;
1604
 
1605
  std::string device_name = props2.properties.deviceName.data();
1606
+ std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
1607
 
1608
  if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
1609
  std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
 
1699
  vk::PhysicalDeviceProperties props = devices[i].getProperties();
1700
 
1701
  if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
1702
+ // Check if there are two physical devices corresponding to the same GPU
1703
+ auto old_device = std::find_if(
1704
+ vk_instance.device_indices.begin(),
1705
+ vk_instance.device_indices.end(),
1706
+ [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
1707
+ );
1708
+ if (old_device == vk_instance.device_indices.end()) {
1709
+ vk_instance.device_indices.push_back(i);
1710
+ } else {
1711
+ // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
1712
+ // This can cause errors when splitting layers across the devices, so only one of them must be kept
1713
+ #ifdef GGML_VULKAN_DEBUG
1714
+ std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
1715
+ #endif
1716
+
1717
+ vk::PhysicalDeviceProperties2 old_prop;
1718
+ vk::PhysicalDeviceDriverProperties old_driver;
1719
+ old_prop.pNext = &old_driver;
1720
+ devices[*old_device].getProperties2(&old_prop);
1721
+
1722
+ vk::PhysicalDeviceProperties2 new_prop;
1723
+ vk::PhysicalDeviceDriverProperties new_driver;
1724
+ new_prop.pNext = &new_driver;
1725
+ devices[i].getProperties2(&new_prop);
1726
+
1727
+ std::map<vk::DriverId, int> driver_priorities {};
1728
+ int old_priority = std::numeric_limits<int>::max();
1729
+ int new_priority = std::numeric_limits<int>::max();
1730
+
1731
+ // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
1732
+ // Smaller number -> higher priority
1733
+ switch (old_prop.properties.vendorID) {
1734
+ case VK_VENDOR_ID_AMD:
1735
+ driver_priorities[vk::DriverId::eMesaRadv] = 1;
1736
+ driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
1737
+ driver_priorities[vk::DriverId::eAmdProprietary] = 3;
1738
+ break;
1739
+ case VK_VENDOR_ID_INTEL:
1740
+ driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
1741
+ driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
1742
+ break;
1743
+ case VK_VENDOR_ID_NVIDIA:
1744
+ driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
1745
+ #if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
1746
+ driver_priorities[vk::DriverId::eMesaNvk] = 2;
1747
+ #endif
1748
+ break;
1749
+ }
1750
+
1751
+ if (driver_priorities.count(old_driver.driverID)) {
1752
+ old_priority = driver_priorities[old_driver.driverID];
1753
+ }
1754
+ if (driver_priorities.count(new_driver.driverID)) {
1755
+ new_priority = driver_priorities[new_driver.driverID];
1756
+ }
1757
+
1758
+ if (new_priority < old_priority) {
1759
+ auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
1760
+ vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
1761
+ vk_instance.device_indices.push_back(i);
1762
+
1763
+ #ifdef GGML_VULKAN_DEBUG
1764
+ std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
1765
+ #endif
1766
+ }
1767
+ #ifdef GGML_VULKAN_DEBUG
1768
+ else {
1769
+ std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
1770
+
1771
+ }
1772
+ #endif
1773
+ }
1774
  }
1775
  }
1776