/* Copyright 2020 The OpenXLA Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #ifndef XLA_PJRT_GPU_SE_GPU_PJRT_CLIENT_H_ #define XLA_PJRT_GPU_SE_GPU_PJRT_CLIENT_H_ #include #include #include #include #include #include #include #include #include "absl/container/flat_hash_map.h" #include "absl/memory/memory.h" #include "absl/status/statusor.h" #include "absl/strings/string_view.h" #include "absl/time/time.h" #include "absl/types/span.h" #include "mlir/IR/BuiltinOps.h" #include "xla/client/local_client.h" #include "xla/hlo/builder/xla_computation.h" #include "xla/layout.h" #include "xla/pjrt/distributed/key_value_store_interface.h" #include "xla/pjrt/gpu/gpu_topology.h" #include "xla/pjrt/gpu/gpu_topology.pb.h" #include "xla/pjrt/gpu/se_gpu_topology_description.h" #include "xla/pjrt/local_device_state.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/pjrt_compiler.h" #include "xla/pjrt/pjrt_device_description.h" #include "xla/pjrt/pjrt_executable.h" #include "xla/pjrt/pjrt_future.h" #include "xla/pjrt/pjrt_stream_executor_client.h" #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h" #include "xla/service/computation_placer.h" #include "xla/service/gpu/gpu_executable_run_options.h" #include "xla/shape.h" #include "xla/stream_executor/device_memory_allocator.h" #include "xla/tsl/framework/allocator.h" #include "xla/xla_data.pb.h" #include "tsl/platform/casts.h" namespace xla { using DeviceTopologyPair = std::pair>, GpuTopologyProto>; class StreamExecutorGpuDevice : public PjRtStreamExecutorDevice { public: StreamExecutorGpuDevice(int id, std::unique_ptr local_device_state, std::string device_kind, std::string device_vendor, std::string compute_capability, int core_count, int node_id, int slice_index = 0); int slice_index() const; absl::string_view device_vendor() const; absl::StatusOr GetAllocatorStats() const override; absl::Span coords() const; absl::StatusOr default_memory_space() const override; private: std::string device_vendor_; int slice_index_; }; class StreamExecutorGpuHbmMemorySpace : public PjRtStreamExecutorMemorySpace { public: static constexpr absl::string_view kKind = "device"; static const int kKindId; StreamExecutorGpuHbmMemorySpace(int id, PjRtDevice* device); }; // A custom PjRtClient that overrides the device assignment method. class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient { public: using xla::PjRtStreamExecutorClient::PjRtStreamExecutorClient; StreamExecutorGpuClient( std::string platform_name, LocalClient* client, std::vector> devices, int process_index, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, bool should_stage_host_to_device_transfers, std::unique_ptr gpu_run_options, std::shared_ptr kv_store, std::shared_ptr distributed_client, bool abort_collectives_on_failure, std::shared_ptr gpu_topology, std::optional num_nodes); std::optional> key_value_store() const override { return kv_store_; } gpu::GpuExecutableRunOptions* gpu_run_options() override; absl::StatusOr GetDefaultDeviceAssignment( int num_replicas, int num_partitions) const override; absl::string_view platform_version() const override; std::optional plugin_attributes() const override; using PjRtStreamExecutorClient::CreateBuffersForAsyncHostToDevice; absl::StatusOr> CreateBuffersForAsyncHostToDevice( absl::Span shape_specs, std::optional>> device_layouts, PjRtMemorySpace* memory_space) override; PjRtFuture<> CopyRawSubBufferToHost(PjRtBuffer* buffer, PjRtFuture dst, int64_t offset, int64_t transfer_size) override; PjRtFuture<> CopyRawHostToDevice( LocalDeviceState* local_device, tsl::RCReference device_buffer, const void* src, int64_t offset, int64_t transfer_size) override; PjRtFuture<> CopyRawDeviceToHost( LocalDeviceState* local_device, tsl::RCReference device_buffer, void* dst, int64_t offset, int64_t transfer_size) override; void CopyToRemoteDevice(PjRtBuffer* buffer, absl::string_view serialized_descriptor, PjRtBuffer::RemoteSendCallback on_done) override; absl::StatusOr>> MakeCrossHostReceiveBuffers(absl::Span shapes, PjRtDevice* device, PjRtCrossHostRecvNotifier notifier) override; absl::StatusOr GetTopologyDescription() const override { return &topology_; } absl::StatusOr GetDefaultLayout( PrimitiveType element_type, absl::Span dims) override; absl::StatusOr> LoadSerialized( absl::string_view serialized, std::optional options, const LoadOptions& load_options); absl::StatusOr> CompileAndLoad( const XlaComputation& computation, CompileOptions options) override; absl::StatusOr> CompileAndLoad( mlir::ModuleOp module, CompileOptions options) override; absl::StatusOr RunAsync( LocalExecutable& exec, PjRtDevice* device, std::vector> arguments, ExecutableRunOptions run_options) override; private: absl::StatusOr> GetLatestIncarnations(); std::optional num_nodes_; xla::StreamExecutorGpuTopologyDescription topology_; std::shared_ptr kv_store_; std::shared_ptr distributed_client_; }; std::vector> BuildLocalDevices( std::map> local_device_states, int node_id); std::string MakeComputeCapabilityString(const se::DeviceDescription* desc); absl::StatusOr BuildDistributedDevices( absl::string_view platform_name, std::map> local_device_states, int node_id, int num_nodes, gpu::GpuExecutableRunOptions* gpu_executable_run_options, std::shared_ptr kv_store, bool enable_mock_nccl, std::optional mock_gpu_topology = std::nullopt, std::optional slice_index = std::nullopt, absl::Duration get_local_topology_timeout = absl::Minutes(2), absl::Duration get_global_topology_timeout = absl::Minutes(5)); absl::StatusOr> GetStreamExecutorGpuClient( const GpuClientOptions& options); // Get the fabric info of a local device ordinal in the format of // "clusterUuid/cliqueId". Empty on SM90 or lower. absl::StatusOr GetDeviceFabricInfo(int device_ordinal); } // namespace xla #endif // XLA_PJRT_GPU_SE_GPU_PJRT_CLIENT_H_