
    Ϧi?                        S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJr  \R                  R!                  \R                  R#                  \5      5      r\R                  R'                  \S5      /r\R                  R'                  \S5      rS	/r\ R.                  " 5       S
 5       r\ R.                  " 5       S 5       rS r " S S\5      rS rS r " S S\5      r " S S\5      r g)    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludelibcudac            	      "   [         R                  " S5      n U (       a  U /$ [        R                  " SS/5      R	                  5       nUR                  5        Vs/ s H  nSU;   d  M  UR                  5       S   PM      nnU Vs/ s H"  n[         R                  R                  U5      PM$     nn[         R                  " S5      nU(       al  U(       de  UR                  S5       Vs/ s HI  n[         R                  R                  [         R                  R                  US5      5      (       d  MG  UPMK     nnSnU(       a  US	[        U5      -  -  nUS
-  nO
US-  nUS-  n[        S U 5       5      (       d   U5       eU$ s  snf s  snf s  snf )NTRITON_LIBCUDA_PATHz/sbin/ldconfigz-plibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      #    U  HB  n[         R                  R                  [         R                  R                  US 5      5      v   MD     g7f)r   N)ospathexistsjoin).0r   s     \/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/triton/backends/nvidia/driver.py	<genexpr>libcuda_dirs.<locals>.<genexpr>'   s/     Sddrww~~bggll4@AAds   A
A)r   getenv
subprocesscheck_outputdecode
splitlinessplitr   dirnamer   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr,      s\   yy!67 !!""$4d#;<CCED *.):U):nPT>TDJJL):DU,01DSBGGOOC DD1))$564288=s=PRPWPWP\P\]`bpPqAr=s
&C2SY>>KKMM>>SdSSSXUXXSK V1 ts   
F'F)F&AF0Fc                  $    [         /[        5       Q$ N)libdevice_dirr,        r   library_dirsr2   +   s    +LN++r1   c           	      4   [         R                  " U R                  S5      5      R                  5       n[	        U5      nUR                  U S35      nUc  [        R                  " 5        n[        R                  R                  US5      n[        US5       nUR                  U 5        S S S 5        [        XU[        5       [        [         5      n[        US5       nUR#                  UR%                  5       U S3SS9nS S S 5        S S S 5        SS Kn	U	R(                  R+                  X5      n
U	R(                  R-                  U
5      nU
R.                  R1                  U5        U$ ! , (       d  f       N= f! , (       d  f       N~= f! , (       d  f       N= f)	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   get_filetempfileTemporaryDirectoryr   r   r   openwriter   r2   include_dir	librariesputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcnamekeycache
cache_pathtmpdirsrc_pathfso	importlibspecmods               r   compile_module_from_srcrV   0   s+   
..G,
-
7
7
9Cc"E4&-J((*fww||FH5Hh$ %YWBb$1"YYqvvxD6dYK
   + >>11$CD
..
)
)$
/CKKC J %$   +*s<   +-F	E'*3F	#E8 F	'
E5	1F	8
F	F		
Fc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	CudaUtilsH   c                 n   > [        U S5      (       d  [        [        U ]  U 5      U l        U R                  $ )Ninstance)hasattrsuperrX   __new__r[   )cls	__class__s    r   r^   CudaUtils.__new__J   s-    sJ'' C8=CL||r1   c                 Z   [        [        [        R                  R	                  [
        S5      5      R                  5       S5      nUR                  U l        UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        UR                  U l        g )Nzdriver.c
cuda_utils)rV   r   r   r   r   r    	read_textload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_1d_tma_descriptorfill_2d_tma_descriptor)selfrU   s     r   __init__CudaUtils.__init__O   s~    %d277<<+L&M&W&W&Y[gh??%(%>%>",/,L,L)$'$<$<!&)&@&@#&)&@&@#r1   )rg   ri   rj   rf   re   rh   )__name__
__module____qualname____firstlineno__r^   rl   __static_attributes____classcell__r`   s   @r   rX   rX   H   s    
A Ar1   rX   c                     U S   S:X  a  g0 SS_SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_U    $ )Nr   *CUdeviceptri1int32_ti8int8_ti16int16_ti32i64int64_tu1uint32_tu8uint8_tu16uint16_tu32u64uint64_tfp16floatbf16fp32f32fp64double	nvTmaDescCUtensorMapr0   tys    r   	ty_to_cppr   ^   s    	!u|ih 	y 	y	
 	y 	j 	i 	z 	z 	z 	 	 	 	w 	  	]!" 	#
 
r1   c                    SR                  S UR                  5        5       5      nS nS nSR                  UR                  5        Vs/ s H  oe" U" U5      5      PM     sn5      nSU-   n[        U5      S:  a)  SSR                  S UR                  5        5       5      -   OSn	/ n
UR                  5        HU  u  pUS   S	:X  a  U
R	                  S
U S35        M%  US:X  a  U
R	                  SU 35        MA  U
R	                  SU 35        MW     UR                  5        Vs/ s H  oU ;  d  M
  UPM     nnS[        U5      S:  a  SU-   OS SSR                  S U 5       5       SSR                  UR                  5        VVs/ s H  u  pU" U5       SU S3PM     snn5       SU SU	 SSR                  UR                  5        VVs/ s H  u  pUS   S	:X  a  SU SU SU SU S3	OSPM!     snn5       SSR                  UR                  5        VVs/ s H  u  pUS:X  a  SU SU S U S!3OSPM     snn5       S"[        U
5      S:  a  SSR                  U
5      -   OS S#3nU$ s  snf s  snf s  snnf s  snnf s  snnf )$Nz, c              3   F   #    U  H  u  p[        U5       S U 3v   M     g7f)z argNr   r   ir   s      r   r    make_launcher.<locals>.<genexpr>x   s#     SARYr]O4s3ARs   !c                 :    U S   S:X  a  gU S:X  a  g[        U 5      $ )Nr   rv   	PyObject*r   r   r   s    r   _extracted_type&make_launcher.<locals>._extracted_typez   s%    a5C<}r1   c                 &    SSSSSSSSSS	S
SS.U    $ )NOrQ   dlbhr   BHIK)r   r   r   longr{   r}   ry   r   r   r   r   r   r0   r   s    r   	format_of make_launcher.<locals>.format_of   s:    
  	r1    	iiiKKOOOOr   c              3   0   #    U  H  u  pS U 3v   M     g7f)z&_argNr0   r   s      r   r   r      s      L:K5:Ks   rv   ptr_infoz.dev_ptrr   z*tma_ptr_arga	  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction functionz) {
  void *params[] = { c              3   ,   #    U  H
  nS U 3v   M     g7f)z&argNr0   )r   r   s     r   r   r      s      <V4sVs   a   };
  if (gridX*gridY*gridZ > 0) {
    if (num_ctas == 1) {
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    } else {
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    } else if (status != CUDA_SUCCESS) {
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }

  if (!PyLong_Check(method_ret)) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }
  if (ptr_as_uint % 64 != 0) {
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }

  return (CUtensorMap*)(ptr_as_uint);
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   z _argz; z
  if(!PyArg_ParseTuple(args, "z", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook ai  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  // raise exception asap
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;z;
  zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z;
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_functionaW  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r   itemsvalueslenappendkeys)	constants	signatureids	arg_declsr   r   r   args_formatformat	args_listinternal_args_listr   paramsrJ   s                 r   make_launcherr   u   s    		SARSSI  ''IDTDTDVWDVb9_R%89DVWXK;&FPST]P^abPbtyy L)//:K LLLhjI"a5C<%%8&<=;%%n5%%QCj1 # #)@)Ai-?a)F@.}\ RU  V_  R`  cd  Rd  ~B  EN  ~N  jl  }m myy <V <<= W>n 88Y__=NO=NEA#$E!B/=NOPQ R  &x (RR[Q\ ]( 77  PY  P_  P_  Pa  b  Pa  GL  GHmopqmrvymy$QC'9!BqcAQRSQTThi  @B  B  Pa  b  c  d d77  GP  GV  GV  GX  Y  GX  ~C}~_aep_p"1#%7s/!N[vxx  GX  Y  Z  [ [S |  @R  |S  VW  |W  TX  [_  [d  [d  ew  [x  Tx  ]_  S` )`AICT Ju X AN P. b Ys$   
I/$	I41I4I9&I?  J$c                        \ rS rSrS rS rSrg)CudaLauncheri  c                   ^ S[        TS5      (       a  TR                  R                  O	[        5       0n[        TS5      (       a  TR                  O	[        5       nU4S jnUR                  5        VVs0 s H  u  pgU" U5      U_M     nnnTR                  R                  5        VVs0 s H  u  pgU" U5      U_M     nnn[        XHU5      m[        TS5      n	U	R                  U l
        g s  snnf s  snnf )Nids_of_const_exprsfnr   c                 |   > [        U [        5      (       a%  TR                  R                  R	                  U 5      $ U $ r.   )
isinstancer!   r   	arg_namesindex)r   rJ   s    r   <lambda>'CudaLauncher.__init__.<locals>.<lambda>  s/    As9K9KCFF,,2215RQRRr1   __triton_launcher)r\   r   
constexprstupler   dictr   r   r   rV   launch)
rk   rJ   metadatar   r   cst_keyrL   valuer   rU   s
    `        r   rl   CudaLauncher.__init__  s    #'#t:L:LSVV%6%6RWRYZ%,S+%>%>CMMDF	R;D??;LM;LZSWS\5(;L	M;>==;N;N;PQ;PZSWS\5(;P	QI#6%c+>?jj	 NQs   5C,*C2c                 (    U R                   " U0 UD6  g r.   r   )rk   argskwargss      r   __call__CudaLauncher.__call__  s    T$V$r1   r   N)rn   ro   rp   rq   rl   r   rr   r0   r1   r   r   r     s    !%r1   r   c                   P   ^  \ rS rSrU 4S jrS rS r\S 5       rS r	S r
SrU =r$ )	
CudaDriveri  c                 V   > [        5       U l        [        U l        [        TU ]  5         g r.   )rX   utilsr   launcher_clsr]   rl   )rk   r`   s    r   rl   CudaDriver.__init__  s    [
(r1   c                 |    U R                  5       nU R                  U5      nUS   S-  US   -   nSn[        SX#5      $ )Nr   
          r
   )get_current_deviceget_device_capabilityr   )rk   device
capability	warp_sizes       r   get_current_targetCudaDriver.get_current_target  sI    ((*//7
]R'*Q-7
	77r1   c                 "    SS K nUR                  $ Nr   )torchr
   )rk   r   s     r   get_device_interfaceCudaDriver.get_device_interface  s    zzr1   c                  |    SS K n U R                  R                  5       =(       a    U R                  R                  S L $ r   )r   r
   is_availableversionhip)r   s    r   	is_activeCudaDriver.is_active  s,    zz&&(Hemm.?.?4.GHr1   c                     SSK Jn  U$ )Nr   )do_bench)triton.testingr   )rk   r   s     r   get_benchmarkerCudaDriver.get_benchmarker  s
    +r1   c                 \    SS K nSnUR                  [        US-  5      UR                  SS9$ )Nr   i      r
   )dtyper   )r   emptyint)rk   r   
cache_sizes      r   get_empty_cache_for_benchmark(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr1   )r   r   )rn   ro   rp   rq   rl   r   r   staticmethodr   r   r  rr   rs   rt   s   @r   r   r     s9    
8 I IQ Qr1   r   )!	functoolsr   r7   r   r<   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.backends.compilerr   triton.backends.driverr   r   r    realpath__file__r   r@   r/   rA   	lru_cacher,   r2   rV   objectrX   r   r   r   r   r0   r1   r   <module>r     s     	     ' 2 . ,
''//"''**84
5ww||GY/0We,H	  0 , ,0A A,
.wt	%6 % "Q "Qr1   