diff --git a/paper/sl.tex b/paper/sl.tex
index aea76c9617c9d53ca8cade9087ad63141bf26b55..03daec5c12812561d09b0fe61ca5e737f3adb179 100755
--- a/paper/sl.tex
+++ b/paper/sl.tex
@@ -288,26 +288,7 @@ This estimate is vital in the employment machine learning and AI systems to ever
 
 \subsection{Causal Modeling}
 
-\begin{figure}
-    \begin{tikzpicture}[->,>=stealth',node distance=1.5cm, semithick]
-
-  \tikzstyle{every state}=[fill=none,draw=black,text=black]
 
-  \node[state] (R)                    {$R$};
-  \node[state] (X) [right of=R] {$X$};
-  \node[state] (T) [below of=X] {$T$};
-  \node[state] (Z) [rectangle, right of=X] {$Z$};
-  \node[state] (Y) [below of=Z] {$Y$};
-
-  \path (R) edge (T)
-        (X) edge (T)
-	     edge (Y)
-        (Z) edge (T)
-	     edge (Y)
-        (T) edge (Y);
-\end{tikzpicture}
-\caption{ $R$ leniency of the decision maker, $T$ is a binary decision,  $Y$ is the outcome that is selectively labled. Background features  $X$ for a subject affect the decision and the outcome. Additional background features  $Z$ are visible only to the decision maker in use. }\label{fig:model}
-\end{figure}
 
 We model the selective labels setting as summarized by Figure~\ref{fig:model}\cite{lakkaraju2017selective}.
 
@@ -324,13 +305,17 @@ We use a propensity score framework to model $X$ and $Z$: they are assumed conti
 %\acomment{We need to start by noting that with a simple example how we assume this to work. If X indicates a safe subject that is jailed, then we know that (I dont know how this applies to other produces) that Z must have indicated a serious risk. This makes $Y=0$ more likely than what regression on $X$ suggests.} done by Riku!
 
 
-\acomment{I do not understand what we are doing from this section. It needs to be described ASAP.}
+%\acomment{I do not understand what we are doing from this section. It needs to be described ASAP.}
+
+
 
 Our approach is based on the fact that in almost all cases, some information regarding the latent variable is recoverable. For illustration, let us consider defendant $i$ who has been given a negative decision $\decisionValue_i = 0$. If the defendant's private features $\featuresValue_i$ would indicate that this subject would be safe to release, we could easily deduce that the unobservable variable $\unobservableValue_i$ indicated high risk since
 %contained so significant information that 
 the defendant had to be jailed. In turn, this makes $Y=0$ more likely than what would have been predicted based on $\featuresValue_i$ alone.
 In an opposite situation,  where the features $\featuresValue_i$ clearly imply that the defendant is dangerous and is subsequently jailed, we do not have that much information available on the latent variable.
 
+\acomment{Could emphasize the above with a plot, with $x$ and $z$ on the axes and point styles indicating the decision.}
+
 \acomment{The above assumes that the decision maker in the data is good and not bad.}
 
 
@@ -415,6 +400,27 @@ In practise, once we have used Stan, we have $S$ samples from all of the paramet
 %	\end{itemize}
 %\end{itemize}
 
+\begin{figure}
+    \begin{tikzpicture}[->,>=stealth',node distance=1.5cm, semithick]
+
+  \tikzstyle{every state}=[fill=none,draw=black,text=black]
+
+  \node[state] (R)                    {$R$};
+  \node[state] (X) [right of=R] {$X$};
+  \node[state] (T) [below of=X] {$T$};
+  \node[state] (Z) [rectangle, right of=X] {$Z$};
+  \node[state] (Y) [below of=Z] {$Y$};
+
+  \path (R) edge (T)
+        (X) edge (T)
+	     edge (Y)
+        (Z) edge (T)
+	     edge (Y)
+        (T) edge (Y);
+\end{tikzpicture}
+\caption{$R$ is the leniency of the decision maker, $T$ is a binary decision, and $Y$ is the outcome that is selectively labeled. Background features $X$ for a subject affect the decision and the outcome. Additional background features $Z$ are visible only to the decision maker in use.}\label{fig:model}
+\end{figure}
+
 \begin{algorithm}
 	%\item Potential outcomes / CBI \acomment{Put this in section 3? Algorithm box with these?}
 \DontPrintSemicolon
@@ -437,6 +443,8 @@ Using Stan, draw $S$ samples of the all parameters from the posterior distributi
 
 % If X has multiple dimensions or the relationships between the features and the outcomes are clearly non-linear the presented approach can be extended to accomodate non-lineairty. Jung proposed that... Groups... etc etc.
 
+
+
 \section{Related work}
 
 Discuss this: \cite{DBLP:conf/icml/Kusner0LS19}