
    ϦiK                         S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	  SSK
Jr  SSK
Jr  S rS rSS	 jrSS
 jrSS jr " S S5      r " S S5      rS rSS jrSS jrS r\SS j5       rSS jrg)    N)contextmanager)AnyDictList   )language)runtimec                    SR                  U 5      n SSSSU -   S/n[        R                  " U5      nUR                  [        R
                  R                  5      R                  S5      nU Vs/ s H  n[        U5      PM     nnU$ s  snf )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounits)	join
subprocesscheck_outputdecodesysstdoutencodingsplitint)attrscmdoutretxs        M/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/triton/testing.pynvsmir      sz    HHUOEsNU$:<[
\C

!
!#
&C
**SZZ((
)
/
/
4C
3a3q63C
J  s   .Bc                    SS K nUbL  UR                  XR                  XR                  S95      R	                  5       n[        U5      S:X  a  US   nU$ US:X  a  U R	                  5       $ [        X25      " U 5      R                  5       $ )Nr   dtyper   all)torchquantiletensorfloattolistlengetattritem)times	quantilesreturn_moder#   r   s        r   _summarize_statisticsr.      sw    nnULL++L$NOVVXs8q=a&C
e||~5&u-2244    c                 >   SSK nUS;   d   eUR                  R                  UR                  R                  5       5         U " 5         Ub1  U H+  nUR	                  5         UR                  S5        SUl        M-     UR                  R                  SS9nUR                  R                  SS9nUR                  5         [        S5       H
  n	U " 5         M     UR                  5         UR                  R                  5         UR                  U5      S-  n
[        S[        X-  5      5      nUR                  R                  5       nUR                  R                  U5         [        U5       H  n	Ub  U H
  nSUl        M     U " 5         M     SSS5        UR                  R                  5         / nSn[        U5       H  n	UR                  R                  SS9nUR                  R                  SS9nUR                  5         UR!                  5         UR                  5         UR                  R                  5         XR                  U5      U-  /-  nM     [#        UR%                  U5      X45      sSSS5        $ ! , (       d  f       N= f! , (       d  f       g= f)	a  
Benchmark the runtime of the provided function.

:param fn: Function to benchmark
:type fn: Callable
:param rep: Repetition time (in ms)
:type rep: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".
:type return_mode: str
r   Nminmaxmeanmedianr"   Tenable_timing   r   
   )r#   cudastreamStreamdetach_requires_grad_gradEventrecordrangesynchronizeelapsed_timer3   r   	CUDAGraphgraphreplayr.   r%   )fnrepgrad_to_noner,   r-   r#   r   start_event	end_event_estimate_msn_repeatgr   	n_retriess                  r   do_bench_cudagraphrR       s&    AAAA			5::,,.	/
#!		  & " jj&&T&:JJ$$4$8	qAD 

 !..y9A=q#c/01 JJ  "ZZa 8_+)!% *	 % ! 	

 	y!A*****>K

((t(<I HHJJJ""$,,Y7(BCCC " %U\\#%6	OY 
0	/4 ! 5 
0	/s&    D!J!-I=C%J=
J	J
Jc           	         US;   d   eSSK n[        R                  R                  R	                  5       nU " 5         UR                  5         [        R                  R                  R                  5       nUR                  SS9n	UR                  SS9n
U	R                  5         [        S5       H  nUR                  5         U " 5         M     U
R                  5         UR                  5         U	R                  U
5      S-  n[        S[        X-  5      5      n[        S[        X,-  5      5      n[        U5       Vs/ s H  oR                  SS9PM     n	n[        U5       Vs/ s H  oR                  SS9PM     n
n[        U5       H
  nU " 5         M     [        U5       HQ  nUb  U H
  nSUl        M     UR                  5         X   R                  5         U " 5         X   R                  5         MS     UR                  5         UR                  [!        X5       VVs/ s H  u  nnUR                  U5      PM     snnUR"                  S9n[%        UXE5      $ s  snf s  snf s  snnf )	a  
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
the 20-th and 80-th performance percentile.

:param fn: Function to benchmark
:type fn: Callable
:param warmup: Warmup time (in ms)
:type warmup: int
:param rep: Repetition time (in ms)
:type rep: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param quantiles: Performance percentile to return in addition to the median.
:type quantiles: list[float], optional
:param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".    :type return_mode: str
r1   r   NTr6   r8   r   r    )r#   r	   driveractiveget_device_interfacerC   get_empty_cache_for_benchmarkr@   rA   rB   zero_rD   r3   r   r?   r%   zipr&   r.   )rH   warmuprI   rJ   r,   r-   r#   dicacherK   rL   rM   rN   n_warmuprO   ir   ser+   s                       r   do_benchra   _   s   " AAAA				3	3	5BDNNNN!!??AE (((.Kt,I1X
  NN**959K 1c&./0H1c#+,-H9>xIA88$8/KI7<XG!-IG8_
  8_ #! " 	
  NNLLK8ST8S1!..+8ST\a\g\gLhE 	??- JG( Us   :I!II"
c                    SSK nSSKn[        XR                  5      (       d  UR	                  U 5      n [        XR                  5      (       d  UR	                  U5      nUc  Sn[        U5      (       a  U" U R                  5      OUnUc  Sn[        U5      (       a  U" U R                  5      OUn[        XR                  5      (       aV  U R                  UR                  :X  a  U R                  5       n U R                  5       R                  5       R                  5       n [        XR                  5      (       aV  UR                  UR                  :X  a  UR                  5       nUR                  5       R                  5       R                  5       nU R                  S:  d  UR                  S:  a  UR                  R                  XX#SS9  gUR                  XX#S9(       d  [        U S	U  S
U SU SU S3
5      eg)a  
Asserts that two inputs are close within a certain tolerance.

:param x: The first input.
:type x: scala, list, numpy.ndarray, or torch.Tensor
:param y: The second input.
:type y: scala, list, numpy.ndarray, or torch.Tensor
:param atol: The absolute tolerance. Default value is 1e-2.
:type atol: float, optional
:param rtol: The relative tolerance. Default value is 0.
:type rtol: float, optional
:param err_msg: The error message to use if the assertion fails.
:type err_msg: str
r   Ng{Gz?g        r   T)atolrtol	equal_nan)rc   rd    z is not close to z (atol=z, rtol=))numpyr#   
isinstanceTensorr%   callabler!   bfloat16r&   cpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrc   rd   err_msgnpr#   s          r   assert_closerw      s     a&&LLOa&&LLO|$TNN4=D|$TNN4=D !\\""77enn$	AEEGNN""$!\\""77enn$	AEEGNN""$ 	vvzQVVaZ


""1d"N;;q$;2y!,=aSvWUYTZZ[\]] 3r/   c                   ~    \ rS rSrSr     SS\\   S\\   S\S\\   S\\   S	\S
\\\4   S\S\S\	S\	4S jjr
Srg)	Benchmark   zc
This class is used by the :code:`perf_report` function to generate line plots with a concise API.
Nx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     Xl         X l        Xl        X0l        X@l        XPl        Xl        Xl        Xl        Xl	        X`l
        Xpl        g)a  
Constructor.
x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
of scalars and there are multiple x_names, all arguments will have the same value.
If x_vals is a list of tuples/lists, each element should have the same length as
x_names.

:param x_names: Name of the arguments that should appear on the x axis of the plot.
:type x_names: List[str]
:param x_vals: List of values to use for the arguments in :code:`x_names`.
:type x_vals: List[Any]
:param line_arg: Argument name for which different values correspond to different lines in the plot.
:type line_arg: str
:param line_vals: List of values to use for the arguments in :code:`line_arg`.
:type line_vals: List[Any]
:param line_names: Label names for the different lines.
:type line_names: List[str]
:param plot_name: Name of the plot.
:type plot_name: str
:param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
:type args: Dict[str, Any]
:param xlabel: Label for the x axis of the plot.
:type xlabel: str, optional
:param ylabel: Label for the y axis of the plot.
:type ylabel: str, optional
:param x_log: Whether the x axis should be log scale.
:type x_log: bool, optional
:param y_log: Whether the y axis should be log scale.
:type y_log: bool, optional
:param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
:type styles: list[tuple[str, str]]
N)r{   r|   r   r}   r~   r   r   stylesr   r   r   r   )selfr{   r|   r}   r~   r   r   r   r   r   r   r   r   s                r   __init__Benchmark.__init__   sA    ^ 
 "$
"	r/   )r   r}   r   r~   r   r   r   r{   r|   r   r   r   ) r   FFN)__name__
__module____qualname____firstlineno____doc__r   strr   r   boolr   __static_attributes__ r/   r   ry   ry      s     ;c; S	; 	;
 9; I; ; 38n; ; ; ; ; ;r/   ry   c            	       F    \ rS rSrS r  SS\S\S\S\4S jjrSS jr	S	r
g
)Marki  c                     Xl         X l        g NrH   
benchmarks)r   rH   r   s      r   r   Mark.__init__  s    $r/   bench	save_path
show_plots
print_datac           	      	   SS K nSS KJn	  SS Kn
UR                  nUR                   Vs/ s H  o S3PM	     nnUR                   Vs/ s H  o S3PM	     nn[        UR                  5      nU
R                  X-   U-   U-   S9nUR                   GH   n[        U[
        [        45      (       d  U Vs/ s H  nUPM     nn[        U5      [        U5      :w  a  [        S[        U5       SU 35      e[        [        X5      5      n/ / / nnnUR                   HI  nU R                   " S0 UDUR"                  U0DUR$                  DUD6n Uu  pnUU/-  nUU/-  nUU/-  nMK     [        U5      U-   U-   U-   UR(                  [        U5      '   GM     UR*                  (       Ga-  U	R-                  5         U	R/                  5       nUS   n[1        UR                  5       GH  u  nnUUS-      UUS-      pUR2                  (       a  UR2                  U   S   OS nUR2                  (       a  UR2                  U   S   OS nUR5                  UU   UU   UUUS9  UR7                  5       R9                  5       (       a  M  UR7                  5       R9                  5       (       a  M  UR;                  [<        5      nUR;                  [<        5      nUR?                  UU   XS	US
9  GM     URA                  5         URC                  URD                  =(       d    U5        URG                  URH                  5        URK                  URL                  (       a  SOS5        URO                  URP                  (       a  SOS5        U(       a  U	RS                  5         U(       a7  U	RU                  URV                  RY                  X!R*                   S35      5        UXR                  -      nU(       a>  URZ                  S   S:X  a+  UR\                  R_                  5       u  nnUU   UU   -
  US'   U(       a1  [a        UR*                  S-   5        [a        URc                  5       5        U(       a;  URe                  URV                  RY                  X!R*                   S35      SU S3SS9  U$ s  snf s  snf s  snf ! [&         a	    US S pn GNwf = f)Nr   z-minz-max)columnsz	Expected z values, got r   )labelcolorlsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr{   	DataFramer|   ri   tupler(   
ValueErrordictrY   r~   rH   r}   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullr"   astyper&   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   r'   print	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meanr   y_miny_maxr{   dfrM   x_argsrow_meanrow_minrow_maxrt   r   axfirst_xr^   colstycol0col1s                                  r   _run	Mark._run  s
   '!!%*%5%56%53d%56%*%5%56%53d%56u}}%\\'"2U":U"B\CAa$// '(1Q(1vW% 9S\N-s!KLL#g/*F)+RwgH__ggVV5>>1*=VVvV;+.(F5 VH$E7"E7" % #1g07:WDBFF3r7O' * ???JJLBajG!%"2"231!!f*~r!f*~u,1LLell1oa(d,1LLell1oa(d7RU!33G||~))++ELLN4F4F4H4H!LL/E!LL/EOOBwKTQTOU 4 IIKMM%,,1'2MM%,,'MM5;;%H=MM5;;%H=
BGGLL6Gt4LMN***+q(**,JD$DBtH,BvJ%//C'(",,.!IIbggll90A.FGXZ[iZjjkVl!  #	y 76 ) ! ;+.d5F5;s#   R%R*2R/
R44SSc           	      L   [        U R                  [        5      nU(       a  U R                  /OU R                  n/ nU(       aP  [        R                  " USS9  [        [        R                  R                  US5      S5      n	U	R                  S5        U HN  n
UR                  U R                  " XX40 UD65        U(       d  M/  W	R                  SU
R                   S35        MP     U(       a!  W	R                  S5        U	R                  5         U(       a  U(       a  US	   $ U$ g )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )ri   r   ry   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runMark.runb  s    %dooyA*:doo&

KK	D1Y?EDJJ'(Edii*[TZ[\y

]5??*;:FG   JJ)*JJL!!}$!!r/   )r   rH   N)F   )FFr   F)r   r   r   r   r   ry   r   r   r   r   r   r   r/   r   r   r     s>    % chC) C C CSW CJr/   r   c                    ^  U 4S jnU$ )z
Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

:param benchmarks: Benchmarking configurations.
:type benchmarks: List of :class:`Benchmark`
c                    > [        U T5      $ r   )r   r   s    r   <lambda>perf_report.<locals>.<lambda>  s    b*-r/   r   )r   wrappers   ` r   perf_reportr   z  s     .GNr/   c                    SSK nSSKJn  U (       d  UR                  R	                  5       n UR
                  R                  R                  U 5      S   nUR
                  R                  R                  U 5      S   nX4-  S-  S-  S	-  nU$ )
zreturn DRAM bandwidth in GB/s r   Nr   rT   mem_clock_ratemem_bus_widthr   g    .A   )r#   r	   rT   r:   current_devicerU   utilsget_device_properties)devicer#   rT   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr    sx    **,MM''==fEFVWM##99&A/RI'!+c1A5GNr/   c                 Z   SS K nSSKJn  U(       d  UR                  R	                  5       nUR
                  R                  R                  U5      S   S-  nUR                  R                  U5      nUS   S:  a  XR                  :X  d   eSnOXR                  UR                  4;   a  SnOtXR                  UR                  UR                  4;   a  SnOKXR                  [        R                   [        R"                  [        R$                  4;   a  S	nO['        S
5      eXQ-  U-  S-  nU$ )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)r#   r	   rT   r:   r   rU   r   r   get_device_capabilityfloat16float32int32rl   int16int8tl
float8e4nvfloat8e4b15float8e5RuntimeError	r!   
clock_rater   r#   rT   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr    s    **,==&&<<VDE[\_``L11&9J!}q%%%]]EKK00"}}ennekkBB"zz2==".."++NN#455&)99D@FMr/   c                     ^  U 4S jnU$ )Nc                 J   >^  [         R                  " T 5      UU 4S j5       nU$ )Nc                    > SS K nUR                  [        R                  " 5       5      R	                  5       nT
R                  5       UR                  5       :*  nU(       a  US:w  a  [        R                  R                  TR                  S   5      n[        R                  S   SS.nSU;   d   S5       eUS   R                  R                  R                  nU S	TR                   S
U S3n[        R                  " SSSU/SUS9n	U	R                   S:X  d   S5       eS[#        U	R$                  5      ;   d   eg T" U 0 UD6  g )Nr   zcuda-memcheck__file__PATH1)r   PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r)  	ppid_namerun_cuda_memcheckr   r(  test_idr   r   target_kwargstest_fns             r   r   1cuda_memcheck.<locals>.decorator.<locals>.wrapper  s!   rzz|499;I - 3 3 5 G Y/%Aww''(;(;J(GH!zz&1UXY F*n,nn* +0099<<b!1!1 2!G9A> nnox%L]agjk~~*e,ee*0C

OCCC((r/   )	functoolswraps)r9  r   r8  s   ` r   	decorator cuda_memcheck.<locals>.decorator  s%    		!	) 
"	)" r/   r   )r8  r=  s   ` r   cuda_memcheckr?    s    , r/   c           	   #     #     [         R                  " / SQ5        [         R                  " SSSSU  SU  3/5        [         R                  " SSSSU SU 3/5        [        S/5      S	   n[        S
/5      S	   n[        X -
  5      S:  d   SU  S35       e[        X1-
  5      S:  d   SU S35       eSU -  nSU-  S-  nXE4v   [         R                  " / SQ5        [         R                  " / SQ5        [         R                  " / SQ5        g ! [         R                  " / SQ5        [         R                  " / SQ5        [         R                  " / SQ5        f = f7f)N)r   r   r   -pmr!  r   r   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr9   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rA  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr  gbpss         r   set_gpu_clockrH    sf    C EF a~>	!
 	 	#M?!M?C	!
 	 123A6678;<./"4_8L\NZ^6__4=01B6b:N}o]a8bb6)L8&-l EF AB AB 	 EF AB ABs   EC D A	EA
EEc                    SS K nSSKJn  U(       d  UR                  R	                  5       nUR
                  R                  R                  U5      S   S-  nUR                  R                  5       nUS   S:  a/  XR                  :X  a  SnOXXR                  :X  a  SnOF[        S	5      eXR                  :X  a  SnO)XR                  UR                  4;   a  SnO[        S	5      eXQ-  U-  S
-  nU$ )Nr   r   r   r  r  r       @   r  r	  )r#   r	   rT   r:   r   rU   r   r   r
  r  r  r  rl   r  s	            r   get_max_simd_tflopsrL    s    **,==&&<<VDE[\_``L113J!}qMM!!mm#!455MM!!}}enn55!455&)99D@FMr/   )   NNr4   )   d   NNr4   )NNr   r   )iF  i  )r;  r   r   r   
contextlibr   typingr   r   r   r   r   r  r	   r   r.   rR   ra   rw   ry   r   r   r  r  r?  rH  rL  r   r/   r   <module>rR     s     	  
 % " "  	5<P~?@D0^f@ @F` `F
:6 C C8r/   