-
Notifications
You must be signed in to change notification settings - Fork 1
/
Slides-extra.tex
87 lines (68 loc) · 2.63 KB
/
Slides-extra.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Implementation Strategy}
\begin{itemize}
\item Create test kernels
\begin{itemize}
\item Start with very simple kernels (e.g.\ copy buffer, add two buffers)
\item Gradually add more features (e.g.\ non-sequential memory accesses, vector instructions, etc.)
\end{itemize}
\item Suggested implementation order
\begin{itemize}
\item Preparation and packetization first (required for simplest kernels)
\item Then easier features: builtins, memory addressing, scalarization, instantiation
\item More complex features last: control flow, optimizations
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Scalarization Process}
\begin{itemize}
\item Look for vector \varying{varying} instructions such as:
\begin{itemize}
\item Leaves that define vector values, vector stores
\item Vector extractions
\item Vector $\to$ scalar bitcasts
\end{itemize}
\item Recursively scalarize until we reach a scalar value
\begin{itemize}
\item Operands before instructions
\item Re-create instructions for each vector element
\item Vector lane $\neq$ SIMD instance!
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}[c]{Scalarization Example: Before}
%
%\center{\includegraphics[scale=0.65]{images/scalarization-start.pdf}}
%
%\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}{Scalarization Example: After}
%
%\center{\includegraphics[scale=0.65]{images/scalarization-end.pdf}}
%
%\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Scalarization Example}
After Scalarization:
\begin{codebox}
kernel void extract_lr(int2 *src, int *left, int *right) {
int tid = get_global_id(0);
int sampleLeft = *((int *)&src[tid] + 0);
int sampleRight = *((int *)&src[tid] + 1);
left[tid] = (sampleLeft >> 1);
right[tid] = (sampleRight >> 1);
}
\end{codebox}
After Packetization:
\begin{codebox}
kernel void extract_lr(int2 *src, int *left, int *right) {
int tid = get_global_id(0);
int4 samplesLeft = interleaved_load_int4((int *)&src[tid] + 0, 2);
int4 samplesRight = interleaved_load_int4((int *)&src[tid] + 1, 2);
vstore4(samplesLeft >> 1, tid, (int *)left);
vstore4(samplesRight >> 1, tid, (int *)right);
}
\end{codebox}
\end{frame}