
    BjR                     
   d dl Z d dlZd dlmZ d dlZ	 d dlmZ dZn# e	$ r dZdZY nw xY wd dl
mZ de j        fdZde j        fd	Zde j        fd
ZdeddfdZdedefdZde j        fdZde j        fdZde j        fdZdee         fdZ	 	 	 	 d$dedededz  dedz  dedz  dedeeef         fdZ G d d          Z G d d          Z	 d%deez  dee         dz  dee edf         z  fdZ!	 d&d ed!ed"edefd#ZdS )'    N)Any)runtimeTF)_get_device_indexreturnc                     	 dd l } t          j        t          |                     d          d                             }ni# t
          t          f$ rU t          j        dk    r.t          j        dt          j
        j        d          d          }nt          j        d          }Y nw xY w|j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |S )Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r   libs     T/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/cuda/_utils.py_get_hip_runtime_libraryr$      s    	0k#h55jAA!DEEFF$ 0 0 0<7""+D%-*;A*>DDDEECC+.//C	0 0C0C!6C2C 4CJs   >A A#B'&B'c                  r    t           j        dk    rt          j        d          S t          j        d          S )Nr	   z
nvcuda.dllzlibcuda.so.1)r   r   r   r        r#   _get_cuda_libraryr(   -   s/    
|w{<((({>***r'   c                  \    t           j        j        rt                      S t	                      S N)r   r   r   r$   r(   r&   r'   r#   _get_gpu_runtime_libraryr+   5   s'    } #'))) """r'   resultc                    | dk    rd S t          j                    }t                      }|                    | t          j        |                     |j        |j                                        nd}t          d|           )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr+   r   byrefvaluedecodeRuntimeError)r,   err_strlibcudaerror_messages       r#   _check_cudar8   =   s    {{oG&((GVV\'%:%:;;;")-";AU  5m55
6
66r'   c                 p   t           st          d          | ^}}|t          j        j        k    rVt          j        |          \  }}t          |t                    r|                                }t          d| d| d          t          |          dk    rdS t          |          dk    r|d         S |S )a  Check a cuda.bindings (cuda-python) call result for errors.

    All cuda.bindings runtime calls return ``(error, *outputs)``.  This
    helper unpacks the tuple, raises on non-success, and returns the
    outputs (``None`` for zero outputs, scalar for one, tuple otherwise).
    zcuda.bindings is not availabler/   z ()r   N   )
_HAS_CUDA_BINDINGSr4   _cuda_bindings_runtimecudaError_tcudaSuccesscudaGetErrorString
isinstancebytesr3   len)r,   errout_r5   s        r#   _check_cuda_bindingsrG   I   s      =;<<<IC#!-9	: 	: #5  	7
 gu%% 	'nn&&G;#;;;;;<<<
3xx1}}t
3xx1}}1vJr'   c                     	 dd l } t          j        t          |                     d          d                             }n# t
          t          f$ r t          j        dk    r\d	                    dt          j        j        d         dt          j        j        d         g          }t          j        d| d          }nt          j        d          }Y nw xY w|j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_         |S )	Nr   hiprtcr	    0   r   zlibhiprtc.so)!r   r   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetCUBINSizehiprtcGetCodenvrtcGetCUBINhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r   r"   version_strs      r#   _get_hiprtc_libraryrc   f   sA   .k#h55h??BCCDD$ . . .<7""''em'*C1B11EF K +8{88899CC+n--C. "6C 4C!6C!6C1C)C!$!<C 4C!$!<C!6CJs   >A BCCc                  "   t          t          j        j                            d          d                   } t
          j        dk    rd|  dg}nd|  dg}|D ](}	 t          j        |          c S # t          $ r Y %w xY wt          d          )	N.r   r	   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_names      r#   _get_nvrtc_libraryrn      s    *0055a899M
|w-}---



 +=**

   	;x((((( 	 	 	H	
4
5
55s   A22
A?>A?c                  \    t           j        j        rt                      S t	                      S r*   )r   r   r   rc   rn   r&   r'   r#   _get_gpu_rtc_libraryrp      s)     } $"$$$!###r'   c                      ddl m} m} dhfd|D             }t          j        j        r|                    |            |S )z
    Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

    Returns:
        List of HIPCC/NVCC flags that can be safely used with NVRTC.
    r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexprc                     g | ]}|v|	S r&   r&   ).0flagnvrtc_unsupported_flagss     r#   
<listcomp>z1_get_gpu_rtc_compatible_flags.<locals>.<listcomp>   s+       d:Q.Q.Q.Q.Q.Qr'   )torch.utils.cpp_extensionrr   rs   r   r   r   extend)rr   rs   compatible_flagsrw   s      @r#   _get_gpu_rtc_compatible_flagsr|      s~     POOOOOOO 	#
   *   } 4 2333r'   kernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc           
      	   ddl }t                      ddt          ddffd}|                     d          }|V|j                            |j                                                  }	|j        j        r	|	j	         }n|	j
         |	j         }g }
|j        j        r+|
                    d|                                            n*|
                    d|                                            dd	lm}  |d
          }|D ],}|
                    d|                                            -|r/|D ],}|
                    d|                                            -|rRt          |j        j                  dk     rt!          d|j        j                   |g }|                    d           |r-|D ]*}|
                    |                    d                     +t#                      }|
                    d |D                        t'          |
          }t)          j        |z  |
 }t)          j                    } |                    t)          j        |          || d                                ddd                     |                    d          } |                    ||                                         |||          }|k    rt)          j                    }                    |t)          j        |                     t)          j        |j                  }                    ||           tA          d|j        !                                           t)          j                    } |"                    |t)          j        |                               t)          j        |j                  } |#                    ||                     t)          j                    } |$                    ||t)          j        |                               |j        |j        !                                }nd}%                    t)          j        |                     |j&        |fS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC
        auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

    Returns:
        Tuple[bytes, str]: The compiled PTX code and mangled kernel name
    r   Nr,   r   c                     | k    rot          j                    }                    | t          j        |                     |j        |j                                        nd}t          d|           d S )Nr.   r/   )r   r0   rO   r1   r2   r3   r4   )r,   r5   r7   NVRTC_SUCCESSlibnvrtcs      r#   check_nvrtcz#_nvrtc_compile.<locals>.check_nvrtc   s    ]""o''G((g1F1FGGG =, $$&&&) 
 =m==>>> #"r'   utf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrh   z-Iz12.8zPCH requires CUDA 12.8+, got z--pchc                 8    g | ]}|                     d           S )r   )encode)ru   rv   s     r#   rx   z"_nvrtc_compile.<locals>.<listcomp>  s$    LLLTDKK((LLLr'   z.cuzKernel compilation failed:
rJ   )'
torch.cudarp   rg   r   rh   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendry   r   r   AssertionErrorr|   rz   rC   r   r0   c_void_prQ   r1   r_   rU   c_size_tr[   create_string_bufferr2   r]   r4   r3   rW   rY   ra   rS   raw)r}   r~   r   r   r   r   r   r   source_bytespropsoptionsr   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsnum_optionsoptions_arrayprogc_kernel_namereslog_sizelogbinary_sizebinaryc_mangled_namemangled_namer   r   s                               @@r#   _nvrtc_compiler      s   0  $%%H M	?C 	?D 	? 	? 	? 	? 	? 	? 	? !''00L !
001J1J1L1LMM= 	?$)$5!7$)K!>!>!> G} O=);==DDFFGGGGD0BDDKKMMNNN 877777&v..' 2 2	'I''..001111  6* 	6 	6INN+	++22445555  %u}!""V++ !UAS!U!UVVVLG$$$  3" 	3 	3FNN6==112222:<<NNLL5KLLLMMM g,,K_{2W=M ?DK##L&&((	
 	
	 	 	  &&w//MK//mDDEEE 
&
&t[-
H
HC m?$$''fl8.D.DEEE)(.99##D#...N#):J:J:L:LNNOOO /##KK**4k1J1JKKLLL():;;FK&&tV44555 _&&NK$$T=&,~:V:VWW   '%+2244  d!3!3444 :|##r'   c                   8    e Zd Zdej        ddfdZdeddfdZdS )_CudaModulemoduler   Nc                 "    || _         i | _        d S r*   )_module_kernels)selfr   s     r#   __init__z_CudaModule.__init__J  s    02r'   name_CudaKernelc           	         || j         v r| j         |         S ddlm}  |            }t          j                    }	 t          |                    t          j        |          | j        |	                    d                               t          || j                  }|| j         |<   |S # t          $ r}t          d| d          |d }~ww xY w)Nr   )r+   r   zNo kernel named 'z' in this module)r   torch.cuda._utilsr+   r   r   r8   r   r1   r   r   r   r4   AttributeError)r   r   r+   r6   funckernelrD   s          r#   __getattr__z_CudaModule.__getattr__N  s    4=  =&& 	?>>>>>**,,  	V++L&&dkk'6J6J   
 !t|44F"(DM$M 	V 	V 	V !KT!K!K!KLLRUU	Vs   A.B* *
C4CC)__name__
__module____qualname__r   r   r   r   r   r&   r'   r#   r   r   I  sb        3v 34 3 3 3 3V V V V V V V Vr'   r   c                       e Zd ZdZdej        dej        ddfdZ	 	 	 	 	 dd	eeeef         d
eeeef         de	dz  dede
dz  ddfdZdeddfdZdS )r   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    r   r   r   Nc                 0    || _         || _        d| _        d S )Nr   )r   r   _max_shared_mem_bytes)r   r   r   s      r#   r   z_CudaKernel.__init__l  s    	%&"""r'   r;   r;   r;   r   gridblockargs
shared_memstreamc                 t   ddl }|j        j                                        }|sg }g }g }	|D ]l}
t	          |
|j                  r|
j        s*|
j        r|
                                st          d          t          j        |
                                          }|                    |           |	                    t          j        |                     t	          |
t                    r<t          j        |
          }|	                    t          j        |                     t	          |
t"                    r=t          j        |
          }|	                    t          j        |                     Ot'          dt)          |
                     t          j        t+          |	          z              }t-          |	          D ]'\  }}
t          j        |
t          j                  ||<   (|ddl}|j                                        }|dk    rD| j        dk    s|| j        k    r.| j        dk    rdn
d| j         d}t7          d	| d
| d          t9          |                    | j        |d         |d         |d         |d         |d         |d         ||j        |d                     dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r;   rL   ) r   rh   _utilsr+   rA   Tensoris_cudais_cpu	is_pinned
ValueErrorr   r   data_ptrr   r1   rg   c_intfloatc_double	TypeErrortyperC   	enumeratecastr   current_streamr   r4   r8   r   r   _as_parameter_)r   r   r   r   r   r   r   r6   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgs                    r#   __call__z_CudaKernel.__call__q  s   & 	*#<<>> 	D 13 	K 	KC#u|,, K{ CJ 3==?? $Y   ocllnn55%%c***fl3//0000C%% KS))fl5112222C'' K!?3//fl8445555 Id3ii I IJJJ #f++588'' 	@ 	@FAs$k#v??LOO >Z..00F ""&!++zD<V/V/V -22 ! JT7JJJ 
 3: 3 3%3 3 3   	""	QQQaaa% 	
 	
 	
 	
 	
r'   shared_mem_bytesc                 |   |dk     r	|| _         d S t                      }t          j                                        }t          j        j        r|j        dk    rdnd}nt          |dd          }||k    rt          d| d| d          d	}t          |                    | j        ||                     || _         d S )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r+   r   rh   r   r   r   r   getattrr4   r8   r!   r   )r   r   r6   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizes         r#   set_shared_memory_configz$_CudaKernel.set_shared_memory_config  s   i'')9D&F*,, z7799= 		 &1X==: N %=u N n,,G,< G G!/G G G   783&&	;  	
 	
 	
 &6"""r'   )r   r   Nr   N)r   r   r   __doc__r   r   r   tuplerg   listr   r   r   r&   r'   r#   r   r   g  s         'V_ 'fo '$ ' ' ' ' &/&/ !_
 _
CcM"_
 S#s]#_
 Tk	_

 _
 d
_
 
_
 _
 _
 _
B(6 (6 (6 (6 (6 (6 (6 (6r'   r   ptxkernel_namesc           
         ddl }t                      }t          | t                    r|                     d          } t          j                    }|j                                        }|5  t          |
                    t          j        |          |                      ddd           n# 1 swxY w Y   |st          |          S i }|D ]q}t          j                    }t          |                    t          j        |          ||                    d                               t          ||          ||<   r|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr   )r   r+   rA   r   r   r   r   rh   r   r8   r   r1   r   r   r   )	r   r   r   r6   r   r   kernelsr   r   s	            r#   _cuda_load_moduler     s      '((G #s "jj!! _FZ&&((F	 I IG,,V\&-A-A3GGHHHI I I I I I I I I I I I I I I  #6""" G 2 2  ''T""FDKK,@,@ 	
 	
 	

 $D&11Ns   +6B--B14B1deviceoptional	allow_cpuc                    t          | t                    r| S t          | t                    rt          j        |           } t          | t          j                  r;|r| j        dvrt          d|            n| j        dk    rt          d|            t          j                                        s&t          | t          j	        j                  r| j
        S t          | ||          S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )rh   cpuz(Expected a cuda or cpu device, but got: rh   z!Expected a cuda device, but got: )rA   rg   r   r   r   r   r   jitis_scriptingrh   idx_torch_get_device_index)r   r   r   s      r#   r   r   -  s      &# &# &f%%&%,'' K 	K{/11 !TF!T!TUUU 2[F""IIIJJJ9!!## fej/00 	:"68Y???r'   )NNNFr*   )FF)"r   r   typingr   r   cuda.bindingsr   r=   r<   r   torch._utilsr   r   r   r$   r(   r+   rg   r8   rG   rc   rn   rp   r   r   r|   boolr   rB   r   r   r   dictr   r&   r'   r#   <module>r     sF    



                !
 F E E E E E&+    .+6; + + + +#&+ # # # #	7 	7 	7 	7 	7 	7     :V[    :6FK 6 6 6 6&$fk $ $ $ $tCy    6 &*%) $O$ O$O$O$ d
O$ d{	O$
 +O$ O$ 5#:O$ O$ O$ O$dV V V V V V V V<S6 S6 S6 S6 S6 S6 S6 S6n 8<- -	u-$(I$4-4]*++- - - -b <A@ @@@48@@ @ @ @ @ @s    	))