summaryrefslogtreecommitdiff
path: root/thesis/safety_reset.tex
diff options
context:
space:
mode:
Diffstat (limited to 'thesis/safety_reset.tex')
-rw-r--r--thesis/safety_reset.tex2804
1 files changed, 2804 insertions, 0 deletions
diff --git a/thesis/safety_reset.tex b/thesis/safety_reset.tex
new file mode 100644
index 0000000..765dcad
--- /dev/null
+++ b/thesis/safety_reset.tex
@@ -0,0 +1,2804 @@
+\documentclass[12pt,a4paper,notitlepage]{report}
+\usepackage[ngerman, english]{babel}
+\usepackage[utf8]{inputenc}
+\usepackage[a4paper, top=2cm, bottom=3.5cm, left=3cm, right=4cm]{geometry}
+% Matti remarkable tablet special size
+%\usepackage[paperwidth=15cm, paperheight=244mm, top=1cm, bottom=1cm, left=5mm, right=5mm]{geometry}
+\usepackage[T1]{fontenc}
+\usepackage[
+ backend=biber,
+ style=numeric,
+ natbib=true,
+ url=false,
+ doi=true,
+ eprint=false
+ ]{biblatex}
+\addbibresource{safety_reset.bib}
+\usepackage{amssymb,amsmath}
+\usepackage{listings}
+\usepackage{eurosym}
+\usepackage{wasysym}
+\usepackage{amsthm}
+\usepackage{tabularx}
+\usepackage{multirow}
+\usepackage{multicol}
+\usepackage{tikz}
+\usepackage{mathtools}
+\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
+\DeclarePairedDelimiter{\paren}{(}{)}
+
+\usetikzlibrary{arrows}
+\usetikzlibrary{chains}
+\usetikzlibrary{backgrounds}
+\usetikzlibrary{calc}
+\usetikzlibrary{decorations.markings}
+\usetikzlibrary{decorations.pathreplacing}
+\usetikzlibrary{fit}
+\usetikzlibrary{patterns}
+\usetikzlibrary{positioning}
+\usetikzlibrary{shapes}
+
+\usepackage[binary-units]{siunitx}
+\DeclareSIUnit{\baud}{Bd}
+\usepackage{hyperref}
+\usepackage{tabularx}
+\usepackage{commath}
+\usepackage{graphicx,color}
+\usepackage{ccicons}
+\usepackage{subcaption}
+\usepackage{float}
+\usepackage{footmisc}
+\usepackage{array}
+\usepackage[underline=false]{pgf-umlsd}
+\usetikzlibrary{calc}
+%\usepackage[pdftex]{graphicx,color}
+\usepackage{epstopdf}
+\usepackage{pdfpages}
+\usepackage{minted} % pygmentized source code
+% Needed for murks.tex
+\usepackage{setspace}
+\usepackage[draft=false,babel,tracking=true,kerning=true,spacing=true]{microtype} % optischer Randausgleich etc.
+% For german quotation marks
+
+\usepackage{fltpage}
+
+\renewcommand{\floatpagefraction}{.8}
+\newcommand{\degree}{\ensuremath{^\circ}}
+\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}}
+
+\usepackage{fancyhdr}
+\fancyhf{}
+\fancyfoot[C]{\thepage}
+\newcommand{\includenotebook}[2]{ % Embed a notebook PDF. #1: section title, #2: base name of resources/#2.pdf
+ \fancyhead[C]{Included Jupyter notebook: #1} % centered header text naming the notebook
+ \includepdf[pages=1, % first page only: it also emits the \section heading and the label #2_notebook
+ pagecommand={\thispagestyle{fancy}\section{#1}\label{#2_notebook}}
+ ]{resources/#2.pdf}
+ \includepdf[pages=2-, % all remaining pages: only the fancy page style is applied
+ pagecommand={\thispagestyle{fancy}}
+ ]{resources/#2.pdf}
+}
+
+\begin{document}
+\selectlanguage{ngerman}
+\input{murks}
+\titelen{A Post-Attack Recovery Architecture for Smart Electricity Meters}
+\titelde{Eine Architektur zur Kontrollwiederherstellung nach Angriffen auf Smart Metering in Stromnetzen}
+\typ{Masterarbeit}
+\grad{Master of Science (M. Sc.)}
+\autor{Jan Sebastian Götte}
+\gebdatum{\rule{2cm}{12pt}} % Geburtsdatum des Autors
+\gebort{\rule{3cm}{12pt}} % Geburtsort des Autors
+\ifdefined\includeprivatedata
+\input{private-data.tex}{}{}
+\fi
+\gutachter{Prof. Dr. Björn Scheuermann}{Prof. Dr.-Ing. Eckhard Grass}
+\mitverteidigung
+\makeTitel
+\selbstaendigkeitserklaerung{\today}
+\vfill
+\selectlanguage{english}
+{\center{
+\begin{minipage}[t][10cm][b]{\textwidth}
+ \center{\ccbysa}
+
+ \center{This work is licensed under a Creative-Commons ``Attribution-ShareAlike 4.0 International'' license. The
+ full text of the license can be found at:}
+
+ \center{\url{https://creativecommons.org/licenses/by-sa/4.0/}}
+
+ \center{For alternative licensing options, source files, questions or comments please contact the author at
+ \texttt{masterarbeit@jaseg.de}}.
+
+ \center{This is version \texttt{\input{version.tex}\unskip} generated on \today. The git repository can be found at:}
+
+ \center{\url{https://git.jaseg.de/master-thesis.git}}
+\end{minipage}
+}}
+\newpage
+
+% Hier folgt die eigentliche Arbeit (bei doppelseitigem Druck auf einem neuen Blatt):
+\tableofcontents
+\newpage
+
+\chapter{Introduction}
+
+In the power grid, as in many other engineered systems, we can observe an ongoing diffusion of information systems into
+industrial control systems. Automation of these control systems has already been practiced for the better part of a
+century. Throughout the 20th century this automation was mostly limited to core components of the grid. Generators in
+power stations are computer-controlled according to electromechanical and economic models. Switching in substations is
+automated to allow for fast failure recovery. Human operators are still vital to these systems, but their tasks have
+shifted from pure operation to engineering, maintenance and surveillance\cite{crastan03,anderson02}.
+
+With the turn of the century came a large-scale trend in power systems to move from a model of centralized generation,
+built around massive large-scale fossil and nuclear power plants, towards a more heterogeneous model of smaller-scale
+generators working together. In this new model large-scale fossil power plants still serve a major role, but two new
+factors come into play. One is the advance of renewable energies. The large-scale use of wind and solar power in
+particular from a current standpoint seems unavoidable for our continued existence on this planet. For the electrical
+grid these systems constitute a significant challenge. Fossil-fueled power plants can be controlled in a precise and
+quick way to match energy consumption. This tracking of consumption with production is vital to the stability of the
+grid. Renewable energies such as wind and solar power do not provide the same degree of controllability, and they
+introduce a larger degree of uncertainty due to the unpredictability of the forces of nature\cite{crastan03}.
+
+Along with this change in dynamic behavior, renewable energies have brought forth the advance of distributed generation.
+In distributed generation end-customers that previously only consumed energy have started to feed energy into the grid
+from small solar installations on their property. Distributed generation is a chance for customers to gain autonomy and
+shift from a purely passive role to being active participants of the electricity market\cite{crastan03}.
+
+To match this new landscape of decentralized generation and unpredictable renewable resources the utility industry has
+had to adapt itself in major ways. One aspect of this adaptation that is particularly visible to ordinary people is the
+computerization of end-user energy metering. Despite the widespread use of industrial control systems inside the
+electrical grid and the far-reaching diffusion of computers into people's everyday lives the energy meter has long been
+one of the last remnants of an offline, analog time. Until the 2010s many households were still served through
+electromechanical Ferraris-style meters that have their origin in the late 19th
+century\cite{borlase01,ukgov04,bnetza02}. Today under the umbrella term \emph{Smart Metering} the shift towards fully
+computerized, often networked meters is well underway. The rollout of these \emph{Smart Meters} has not been very
+smooth overall with some countries severely lagging behind. As a safety-critical technology, smart metering technology
+is usually standardized on a per-country basis. This leads to an inhomogeneous landscape with--in some instances--wildly
+incompatible systems. Often vendors only serve a single country or have separate models of a meter for each country.
+This complex standardization landscape and market situation has led to a proliferation of highly complex, custom-coded
+microcontroller firmware. The complexity and scale of this--often network-connected--firmware makes for a ripe substrate
+for bugs to surface.
+
+A remotely exploitable flaw inside a smart meter's firmware\footnote{
+ There are several smart metering architectures that ascribe different roles to the component called \emph{smart
+ meter}. Coarsely divided into two camps these are systems where all metering and communication functions reside
+ within one physical unit and systems where metering and communication functions are separated into two units called
+ the \emph{smart meter} and the \emph{smart meter gateway}\cite{stuber01}. An example for the former are setups in
+ the USA, an example of the latter is the setup in Germany. For clarity, in this introductory chapter we use
+ \emph{smart meter} to describe the entire system at the customer premises including both the meter and a potential
+ gateway.
+} could have consequences ranging from impaired billing functionality to an existential threat to grid
+stability\cite{anderson01,anderson02}. In a country where meters commonly include disconnect switches for purposes such
+as prepaid tariffs a coördinated attack could at worst cause widespread activation of grid safety systems by repeatedly
+connecting and disconnecting megawatts of load capacity in just the wrong moments\cite{wu01}.
+
+Mitigation of these attacks through firmware security measures is unlikely to yield satisfactory results. The enormous
+complexity of smart meter firmware makes firmware security extremely labor-intensive. The diverse standardization
+landscape makes a coördinated, comprehensive response unlikely.
+
+In this thesis, instead of focusing on the very hard task of improving firmware security we introduce a pragmatic
+solution to the--in our opinion likely--scenario of a large-scale compromise of smart meter firmware. In our proposal
+the components of the smart meter that are threatened by remote compromise are equipped with a physically separate
+\emph{safety reset controller} that listens for a reset command transmitted through the electrical grid's frequency and
+on reception forcibly resets the smart meter's entire firmware to a known-good state. Our safety reset controller
+receives commands through Direct Sequence Spread Spectrum (DSSS) modulation carried out on grid frequency through a
+large controllable load such as an aluminum smelter. After forward error correction and cryptographic verification it
+re-flashes the meter's main microcontroller over the standard JTAG interface.
+
+In this thesis, starting from a high level architecture we have carried out extensive simulations of our proposal's
+performance under real-world conditions. Based on these simulations we implemented an end-to-end prototype of our
+proposed safety reset controller as part of a realistic smart meter demonstrator. Finally we experimentally validated
+our results and we will conclude with an outline of further steps towards a practical implementation.
+
+\chapter{Fundamentals}
+
+\section{Structure and operation of the electrical grid}
+
+Since this thesis is filed under \emph{computer science} we will provide a very brief overview of some basic concepts of
+modern power grids.
+
+\subsection{Structure of the electrical grid}
+
+The electrical grid is composed of a large number of systems such as distribution systems, power stations and substations
+interconnected by long transmission lines. Mostly due to ohmic losses\footnote{
+ Power dissipation of a resistor of resistance $R [\Omega]$ given current $I [\mathrm{A}]$ is $P_\text{loss} [\mathrm{W}] =
+ U_\text{drop} \cdot I = I^2 \cdot R$. Fixing power $P_\text{transmitted} [\mathrm{W}] = U_\text{line} \cdot I$ this yields a
+ dependency on line voltage $U_\text{line} [\mathrm{V}]$ of $P_\text{loss} =
+ \left(\frac{P_\text{transmitted}}{U_\text{line}}\right)^2 \cdot R$. Thus, ignoring other losses a $2\times$ increase
+ in transmission voltage halves current and cuts ohmic losses to a quarter. In practice the economics are much more
+ complicated due to the cost of better insulation for higher-voltage parts and the cost of power factor compensation.
+}
+the efficiency of transmission of electricity through long transmission lines increases with the square of
+voltage\cite{crastan01,simon01}. % simon01: p. 425, 9.4.1.1, crastan p.55, 3.1
+In practice economic considerations take into account a reduction of the considerable transmission losses (about
+\SI{6}{\percent} in case of Germany\cite{destatis01}) as well as the cost of equipment such as additional transformers
+and the cost increase for the increased voltage rating of components such as transmission lines. Overall these
+considerations have led to a hierarchical structure where large amounts of energy are transmitted over very long
+distances (up to thousands of kilometers) at very high voltages (upwards of \SI{200}{\kilo\volt}) and voltages get lower
+the closer one gets to end-customer premises. In Germany at the local level a substation will distribute
+\SIrange{10}{30}{\kilo\volt} to large industrial consumers and small transformer substations which convert this to
+the \SI{400}{\volt} three-phase AC households are usually hooked up with\cite{crastan01}.
+
+\subsubsection{Transmission lines, bus bars and tie lines}
+
+The number one component of the electrical grid are transmission lines. Short transmission lines that tightly couple
+parts of a substation are called \emph{bus bars}. Transmission lines that couple otherwise independent grid segments are
+called \emph{tie lines}. A tie line often connects grid segments operated by two different operators e.g.\ across a
+country border.
+
+In mathematical analysis \emph{short} transmission lines can be approximated as a simple lumped-component
+RLC\footnote{Resistor-inductor-capacitor.} circuit. In longer lines the effect of wave propagation along the line has to
+be taken into consideration. In the lumped model the transmission line is represented by a circuit of one or two
+inductors, one or two capacitors and some resistors. This representation simplifies analysis. For \emph{long}
+transmission lines above \SI{50}{\kilo\meter} (cable) or \SI{250}{\kilo\meter} (overhead lines) this approximation
+breaks down and wave propagation along the line's length has to be taken into account. The resulting model is what RF
+engineering calls a transmission line and models the line's parasitics\footnote{Stray capacitance, ohmic resistance and
+stray inductance.} as being uniformly distributed along the length of the line. To approximate this model in
+lumped-element evaluations the line is represented as a long chain of small lumped-component RLC sections. This complex
+structure makes simulation and analysis more difficult in comparison to short lines\cite{crastan01}.
+
+Almost all transmission lines used in the transmission and distribution grid use three-phase alternating current (AC).
+Long-distance overland lines are usually implemented as overhead lines due to their low cost and ease of maintenance.
+Underground cables are much more expensive because of their insulation and are only used when overhead lines cannot be
+used for reasons such as safety or aesthetics. In specialized applications such as long, high-power undersea cables
+high-voltage DC (HVDC) is used. In HVDC converter stations at both ends of the line convert between three-phase AC and
+the line's DC voltage. These converter stations are controlled electronically and do not exhibit any of the mechanical
+inertia that is characteristic for rotating generators in a power plant. Since HVDC re-synthesizes three-phase AC from
+DC at the receiving end of the line it can be used to couple non-synchronous grids. This allows for additional degrees
+of control over the transmission of power compared to a regular transmission line. These technical benefits are offset
+by high initial cost (mostly due to the converter stations) leading to HVDC being used in specific situations
+only\cite{crastan03}.
+
+\subsubsection{Generators}
+
+Traditionally all generators in the power grid were synchronous machines. A synchronous machine is a generator whose
+copper coils are wound and connected in such a way that during normal operation its rotation is synchronous with the
+grid frequency. Grid frequency and generator rotation speed are bidirectionally electromechanically coupled. If a
+generator's angle of rotation were to lag behind the grid it would receive electrical energy from the grid and convert it
+into mechanical energy, acting as a motor. When the machine leads it acts as a generator and is braked. Small
+deviations between rotational speed and grid frequency will be absorbed by the electromechanical coupling between both.
+Maintaining optimal synchronization over time is the task of complex control systems inside power stations' speed
+governors\cite{simon01,crastan01}.
+
+Nowadays besides traditional rotating generators the grid also contains a large amount of electronically controlled
+inverters. These inverters are used in photovoltaic installations and other setups where either DC or non-synchronous AC
+is to be fed into the grid. Setups like these behave differently to rotating generators. In particular \emph{inertia} in
+these setups is either absent or a software parameter. This potentially reduces their overload capacity compared to
+rotating generators. The fundamentally different nature of electronically controlled inverters has to be taken into
+account in planning and regulation\cite{crastan03}.
+
+\subsubsection{Switchgear}
+
+In the electrical grid switches perform various roles. The ones a computer scientist would recognize are used for
+routing electricity between transmission lines and transformers and can be classified into ones that can be switched
+under load (called load switches) and ones that cannot (called disconnectors). The latter are used to ensure parts of
+the network are free from voltage e.g.\ during maintenance. The former are used to re-route flows of electrical
+currents. A major difference in their construction is that in contrast to disconnectors load switches have built-in
+components that extinguish the high-power arc discharge that forms when the circuit is interrupted under load\footnote{
+ While an arc discharge is considered a fault condition in most low-voltage systems including computers, in energy
+ systems it is often part of normal operation.
+}. Beyond this there are circuit breakers. Circuit breakers are safety devices that even under failure conditions can
+still switch at several times the circuit's nominal current. They are activated automatically on conditions such as
+overcurrent or overvoltage. Finally, fuses can be considered non-resettable switches. The fuse in a computer power
+supply is barely more than a glass tube with some wire in it that is designed to melt at the designated current. In
+energy systems fuses are often much more complex devices that in some cases utilize explosives to quickly and decisively
+open the circuit and extinguish the resulting arc discharge\cite{nelles01,crastan01,simon01}.
+% disconnect switches, fuses, breakers -> crastan 1 (ch. 8)
+
+\subsubsection{Transformers}
+
+Along with transmission lines transformers are one of the main components most people will be thinking of when talking
+about the electrical grid. Transformers connect grid segments at different voltage levels with one another. In the
+distribution grid transformers are used to provide standard end-user voltage levels to the customer (e.g.\ \SI{230}{\volt}/\SI{400}{\volt} in
+Europe) from a \SIrange{10}{25}{\kilo\volt} feeder. In places that use overhead wiring to connect customer households
+this is the role of the pole-mounted gray devices the size of a small refrigerator that are characteristic for these
+systems. Transformers can also be used to convert between buses without a fourth neutral conductor and buses with one.
+
+Transformers are large and heavy devices consisting of thick copper wire or copper foil windings arranged around a core
+made from thin stacked, insulated iron sheets. The entire core sits within a large metal enclosure that is filled with
+liquid (usually a specialized oil) for both cooling and electrical insulation. This cooling liquid is cooled by radiator
+fins on the transformer enclosure itself or an external heat exchanger. Depending on the design cooling may rely on
+natural convection within the cooling liquid or on electrical pumps\cite{crastan01,simon01}.
+
+Transformers come in a large variety of coil and wiring configurations. There exist autotransformers where the secondary
+is part of the primary (or vice-versa) that are used to translate between voltage levels without galvanic isolation at
+lower cost. Transformers used in parts of the electrical grid often have several taps and include \emph{tap changers}. A
+tap changer is a system of mechanical switches that can be used to switch between several discrete transformer ratios to
+adjust secondary voltage under load\cite{simon01}. Tap changers are used in the distribution grid to maintain the
+specified voltage tolerances at the customer's connection.
+
+\subsubsection{Instrument transformers}
+
+While operating on the exact same physical principles instrument transformers are very different from regular
+transformers in an energy system. Instrument transformers are specialized low-power transformers that are used as
+transducers to measure voltage or current at very high voltages. They are part of the control and protection systems of
+substations\cite{crastan01}.
+
+\subsubsection{Chokes}
+
+Chokes are large inductors. In power grid applications their construction is similar to the construction of a
+transformer with the exception that they only have a single winding on the core. They are used for a variety of
+purposes. A frequent use is as a series inductor on one of the phases or the neutral connection to limit transient fault
+currents. In addition to this inductors are also used to tune LC circuits. One such use are Petersen coils, large
+inductors in series with the earth connection at a transformer's star point that are used to quickly extinguish arcs
+between phase and ground on a transmission line. The Petersen coil forms a parallel LC resonant circuit with the
+transmission line's earth capacitance. Tuning this circuit through adjusting the Petersen coil reduces earth fault
+current to a level low enough to quickly extinguish the arc\cite{simon01}.
+
+\subsubsection{Power factor correction}
+
+Power factor is a power engineering term that is used to describe how close the current waveform of a load is to that of
+a purely resistive load. Given sinusoidal input voltage $V(t) = V_\text{pk} \sin \paren{\omega_\text{nom} t}$ with
+$\omega_\text{nom} = 2 \pi f_\text{nom} = 2 \pi \cdot \SI{50}{\hertz}$ being the nominal angular frequency, the current
+waveform of a resistor with resistance $R \left[\Omega\right]$ according to Ohm's law would be $I(t) = \frac{V(t)}{R} =
+\frac{1}{R} V_\text{pk} \sin\paren{\omega_\text{nom} t}$. In this case voltage and current are perfectly in phase, i.e.\ the
+current at time $t$ is linear in voltage at constant factor $\frac{1}{R}$.
+
+In contrast to this idealized scenario reality provides us with two common issues: One, the load may be reactive. This
+means its current waveform is an ideal sinusoid, but there is a phase difference between mains voltage and load current
+like so: $I(t) = \frac{1}{\left|Z\right|} V_\text{pk} \sin\paren{\omega_\text{nom} t + \varphi}$. $Z$
+is the load's complex impedance combining inductive, capacitive and resistive components and $\varphi$ is the phase
+difference between the resulting current waveform and the mains voltage waveform. Examples of such loads are motors and
+the inductive ballasts in old fluorescent lighting fixtures.
+
+The second potential issue are loads with a non-sinusoidal current waveform. There are many classes of these but the
+most common one are the switching-mode power supplies (SMPS) used in most modern electronic devices. Most SMPS have an
+input stage consisting of a bridge rectifier followed by a capacitor that provides high-voltage DC power to the following
+switch-mode converter circuit. This rectifier-capacitor input stage under normal load draws a high current only at the
+very peak of the input voltage sinusoid and draws almost zero current for most of the period.
+
+These two cases are measured by \emph{displacement power factor} and \emph{distortion power factor} that when combined
+yield the overall true power factor. The power factor is a key quantity in the design and operation of the power grid.
+As a variable in the operation of electrical grids it is also referred to as \emph{VAR} after its unit Volt-Ampère
+Reactive. A high power factor (close to $1.0$, i.e.\ an in-phase sinusoidal current waveform) yields lowest
+transmission and generation losses. If reactive power generation and consumption are mismatched and power factor is
+low, high currents develop that lead to high transmission losses. For this reason grids include circuits to compensate
+reactive power imbalances\cite{crastan01}. These circuits can be as simple as inductors or capacitors connected to a
+power line but often can be switched to adapt to changing load conditions. Static var compensators are particularly
+fast-acting reactive power compensation devices whose purpose is to maintain a constant bus voltage\cite{rogers01}.
+
+\subsubsection{Loads}
+
+Lastly, there are the loads that the electrical grid serves. Loads range from mains-powered indicator lights in devices
+such as light switches or power strips weighing in at mere milliwatts to large smelters in industrial metal production
+that can consume a fraction of a gigawatt all on their own.
+
+\subsection{Operational concerns}
+\subsubsection{Modelling the electrical grid}
+
+Modelling performs an important role in the engineering of a reliable power infrastructure. The grid is a complex,
+highly dynamic system. To maintain operational parameters such as voltage, grid frequency and currents inside their
+specified ranges complex control systems are necessary. To design and parametrize such control systems simulations are a
+valuable tool. Using model calculations the effects of control systems on operational variables such as transmission
+efficiency or generation losses can be estimated. Model simulations can be used to identify structural issues such as
+potential points of congestion. The same models can then be used to engineer solutions to such issues, e.g.\ by
+simulating the effect of a new transmission line.
+
+There are several aspects under which the grid or parts of the grid can be simulated. There are static analysis methods
+such as modal analysis that yield information on problematic electromechanical oscillations by computing the eigenvalues
+of a large system of differential equations describing the collective behavior of all components of the grid. Modal
+analysis is one example of simulations used in grid planning. Modal analysis is used in decisions to install additional
+stabilization systems in a particular location. In contrast to static analysis, transient simulations calculate an
+approximation of the time-domain behavior of some variable of interest under a given model. Transient simulations are
+used e.g.\ in the design of control systems. Finally, power flow equations describe the flow of electrical energy
+throughout the network from generator to load. Numerical solutions of these equations are used to optimize control
+parameters to increase overall efficiency.
+
+% TODO decide what of this to keep.
+% \subsubsection{Generator controls}
+% \subsubsection{Load shedding}
+% \subsubsection{System stability}
+% \subsubsection{Power System Stabilizers}
+
+\section{Smart meter technology}
+
+Smart meters were a concept pushed by utility companies throughout the early 21st century. Smart metering is one component of the
+larger societal shift towards digitally interconnected technology. Old analog meters required that service personnel
+physically come to read the meter. \emph{Smart} meters automatically transmit their readings through modern
+technologies. Utility companies were very interested in this move not only because of the cost savings for meter reading
+personnel: An always-connected meter also allows several entirely new use cases that have not been possible before. One
+often-cited one is utilizing the new high-resolution load data to improve load forecasting to allow for greater
+generation efficiency. Computerizing the meter also allows for new fee models where electricity cost is no longer fixed
+over time but adapts to market conditions. Models such as prepayment electricity plans where the customer is
+automatically disconnected until they pay their bill are significantly aided by a fully electronic system that can be
+controlled and monitored remotely\cite{anderson02}. A remotely controllable disconnect switch can also be used to coerce
+customers in situations where that was not previously economically possible\footnote{
+ The Swiss association of electrical utility companies in Section 7.2 Paragraph (2)a of their 2010 white paper on the
+ introduction of smart metering\cite{vseaes01} cynically writes that remotely controllable disconnect switches ``lead
+ a new tenant to swiftly register'' with the utility company. This white paper completely vanished from their website
+ some time after publication, but the internet archive has a copy.
+}. Figure~\ref{fig_smgw_schema} shows a schema of a smart metering installation in a typical household\cite{stuber01}.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{resources/smgw_usage_scenario}
+ \vspace*{1cm}
+ \caption{A typical usage scenario of a smart metering system in a home. This diagram shows a gateway
+ connected to multiple smart meters through its local metrological network (LMN) and a multitude of devices on the
+ customer's home area network (HAN). A solar inverter and an electric car are connected through a controllable local
+ systems (CLS) adaptor.}
+ \label{fig_smgw_schema}
+\end{figure}
+
+To the customer the utility of a smart meter is largely limited to the convenience of being able to read it without
+going to their basement. In the long term it is said that there will be second-order savings to the customer since
+electricity prices adapting to the market situation along with this convenience will lead them to consume less
+electricity and to consume it in a way that is more amenable to utilities, both leading to reduced
+cost\cite{borlase01,bmwi03,anderson02}.
+
+Traditional Ferraris counters with their distinctive rotating aluminum disc are simple electromechanical devices. Since
+they do not include any semiconductors or other high technology that might be prone to failure a cheap Ferraris-style
+meter can last decades. In contrast to this, smart meters are complex high technology. They are vastly more expensive to
+develop in the first place since they require the development and integration of large amounts of complex, custom
+firmware. Once deployed, their lifetime is limited by this complexity. Complex semiconductor devices tend to fail, and
+firmware that needs to communicate with the outside world tends to not age well\cite{borkar01}. This combination of
+higher unit cost and lower expected lifetime leads to increased costs per household. This cost is usually shared between
+utility and customer.
+
+As part of its smart metering rollout the German government in 2013 had a study conducted on the economics of smart
+meter installations. This study came to the conclusion that for the majority of households computerizing an existing
+Ferraris meter is uneconomical. For larger consumers or new installations the higher cost of installation over time is
+expected to be offset by the resulting savings in electricity cost\cite{bmwi03}.
+
+\subsection{Smart metering and Human-Computer Interaction}
+
+A fundamental aspect in realizing many of the cost and energy savings promised by the smart metering revolution is that
+it requires a paradigm shift in consumer interaction. Previously most consumers would only confront their energy use
+when they receive their monthly or yearly electricity bill. A large part of the cost savings smart meters promise over
+traditional metering infrastructure\footnote{ We are excluding savings from Demand-Side Response (DSR) implemented
+through smart meters here: Traditional ripple control systems already allowed for these\cite{dzung01}, and due to the
+added cost of high-power relays many smart meters do not include such features. } critically depend on the consumer
+regularly interacting with the meter through an in-home display or app, then changing their behavior. We live in an era
+where our attention is already highly contested. A myriad of apps and platforms compete for our attention through our
+smart phones and other devices. Introducing an entirely new service exerting cognitive pressure into this already
+complex battleground is a large endeavour. On the one hand it is not clear how this new service would compete with
+everything else. On the other hand if it does manage to capture our attention and lead us to modify our behavior, what
+are the side effects? For instance an in-home display might increase financial anxiety in economically disadvantaged
+customers.
+
+Human Computer Interaction research has touched the topic of smart metering several times and has many insights to offer
+for technologists\cite{pierce01,rodden01,lupton01,costanza01,fell01}. An issue pointed out in \cite{rodden01} is that at
+least in some countries consumers fundamentally distrust their utility companies. This trust issue is exacerbated by
+smart meters being unilaterally forced onto consumers by utility companies. Much of the success of smart metering's
+ubiquitous promises of energy savings depends on consumer coöperation. Here, the aforementioned trust issue calls into
+question smart metering's chances of long-term success.
+
+As \cite{pierce01} pointed out smart metering developments could benefit greatly from early involvement of HCI research.
+A systematic analysis of non-technical aspects can prevent issues such as privacy implications initially being
+overlooked in the Dutch deployment\cite{cuijpers01}. It is not clear that current standardization practice encompasses
+an in-depth consideration of the role of consumers in the socio-technological environment posed by this new technology.
+Standardization is often narrowly focused on technological aspects with little input beyond the occasional public
+consultation at the time the new standards are being implemented into law. This corporate-driven approach to
+technological progress being forced through national standardization bears a risk of failing to meet its advertised
+consumer benefits.
+
+\subsection{Common components}
+\label{sm-cpu}
+
+Smart meters usually are built around an off-the-shelf microcontroller (microcontroller unit, MCU). Some meters use
+specialized smart metering system-on-chips (SoCs)\cite{ifixit01} while others use standard microcontrollers with core
+metering functions implemented in external circuitry (cf.\ Section \ref{sec-easymeter} where we detail the meter in our
+demonstration setup). Specialized SoCs usually contain a segment LCD driver along with some high-resolution
+analog-to-digital converters for the actual measurement functions. In many smart meter designs the metering SoC is
+connected to another full-featured SoC acting as the modem. At a casual glance this might seem to be a security measure,
+but it is more likely that this is done to ease integration of one metering platform with several different
+communication stacks (e.g.\ proprietary sub-gigahertz wireless, power line communication (PLC) or Ethernet). In these
+architectures there is a clear line of functional demarcation between the metering SoC and the modem. As evidenced by
+over-the-air software update functionality (see e.g.\ \cite{honeywell01}) this does not however extend to an actual
+security boundary.
+
+Energy usage is calculated by measuring both voltage and current at high resolution and then integrating the
+measurements. Current measurements are usually made with either a current transformer or a shunt in a four-wire
+configuration. Voltage is measured by dividing input AC down with a resistor chain. Both are integrated digitally using
+the MCU's time base as a reference.
+
+Whereas legacy electromechanical energy meters only provided a display of aggregate energy use through a decimal counter
+as well as an indirect indication of power through a rotating wheel one of the selling points of smart meters is their
+ability to calculate advanced statistics on energy use. These statistics are supposed to help customers better target
+energy conservation measures\cite{bmwi03}.
+
+Smart meters can perform additional functions in addition to pure measurement and data aggregation. One is to serve as a
+gateway between the utility company's control systems and large controllable loads in the consumer's household for
+Demand-Side Management (DSM)\cite{borlase01}. In DSM the utility company can control when exactly a high-power device
+such as a water storage heater is switched on. To the customer the precise timing does not matter since the storage
+heater is set so that it has enough hot water in its reservoir at all times. The utility company however can use this
+degree of control to reduce load variations during peak times. The efficiency gains realized with this system translate
+into lower electricity prices for DSM-enabled loads for the customer. Traditionally DSM was realized on a local level
+using ripple control systems. In ripple control, control data is coded by modulating a carrier at a low frequency such as
+\SI{400}{\hertz} on top of the regular mains voltage. These systems require high-power transmitters at tens of kilowatts
+and still can only bridge regional distances\cite{dzung01}.
+
+Another important additional function is that some smart meters can be used to remotely disconnect consumer households
+with outstanding bills. Using euphemisms such as \emph{utility revenue protection}\cite{kamstrup01} or \emph{reducing
+nontechnical losses}\cite{brown01} while cynically claiming \emph{Consumer Empowerment}\cite{kamstrup01} these systems
+allow a utility company to remotely disconnect a customer at any time\cite{anderson01}. Whereas before smart metering
+this required either additional hardware or an expensive site visit by a qualified technician smart meters have ushered
+in an era of frictionless control\footnote{ Note that in some countries such as the UK non-networked mechanical
+prepayment meters did exist. In such systems the user inserts coins into a coin slot that activates a disconnect switch
+at the household's main electricity connection. These systems were non-networked and did not allow for remote control.
+Disadvantages of such systems compared to modern \emph{smart} systems are the high cost of the coin acceptor and the
+overhead of site visits required to empty the coin box\cite{anderson02}. }.
+
+\subsection{Cryptographic coprocessors}
+
+Just like in legacy electricity meters in smart meters physical security is still a key component of the overall system
+design. Since in both types of meter cost depends on physical quantities being measured at the customer premises
+customers can save cost in case they are able to falsify the meter's measurements without being
+detected\cite{anderson02}. For this reason both types of meters employ countermeasures against physical intrusion.
+Compared to high-risk devices such as card payment processing terminals or ATMs the tamper proofing used in smart meters
+is only basic\cite{anderson02}. Common measures include sealing the case by irreversibly ultrasonically welding the
+front and back plastic shells together or the use of security seals on the lid covering the input and output screw
+terminals. The common low-tech attack of using magnets to saturate the current transformer's ferrite cores is detected
+using hall sensors\cite{anderson02,anderson03,itron01,hager01,easymeter01}. German smart metering standards specify the
+use of a smartcard-like security module to provide transport encryption and other cryptographic
+services\cite{bsi-tr-03109-2,bsi-tr-03109-2-a}. During our literature review we did not find many references to similar
+requirements in other national standards, though this does not mean that individual manufacturers do not use smartcards
+for engineering reasons or due to pressure from utilities. The limited documentation on meter internals that we did find
+such as \cite{ifixit01,bigclive01,eevblog01} suggests where no such regulation exists manufacturers and utilities likely
+choose to forego such advanced measures and instead settle on simple software implementations.
+
+\subsection{Physical structure and installation}
+
+Smart meters are installed like traditional electricity meters. In Japan this means they are usually installed on an
+exterior wall and need to be resistant against weather and extreme environmental conditions (direct sunlight, high
+temperature, high humidity). In Germany the meter is always installed either indoors or in an outdoor utility closet
+that is sealed to keep out the weather. In most countries the meter is connected through large integrated screw
+terminals. In the US meters compliant with the domestic ANSI C12 standard are round and plug into a large socket that is
+wired into the house or apartment's electrical connection.
+
+Modern smart meters are usually made with plastic cases. Ferraris meters often used cases stamped from sheet metal with
+glass windows on them. Smart meters now look much more like other modern electronic devices. A common construction style
+is to separate the case into front and back halves with both clipped or ultrasonically welded together. Ultrasonic
+welding gives a robust, airtight interface that cannot easily be separated and reconnected without leaving visible
+traces, which helps with tamper evidence properties. As an industry-standard process common in various consumer goods
+ultrasonic welding is a cheap and accessible technology\cite{easymeter01,ifixit01}.
+
+Communication interfaces sometimes are brought out through regular electromechanical connectors but often also are
+optical interfaces. A popular style here is to use a regular UART connected to an LED/phototransistor optocoupler
+mounted on the side of the case. The user interface is usually limited to an LCD display. For cost and ingress
+protection smart meters rarely use mechanical buttons. Some smart meters use a phototransistor mounted behind the
+faceplate that can be activated with a flashlight as a crude contact-less input device\cite{easymeter01}.
+
+All meters provide several options for security seals to be installed to detect opening of the meter or access to its
+terminal block. The shape and type of these security seals varies. Factory-installed seals are used to detect tampering
+of the meter itself while seals made by the utility during meter installation are used to guard the meter's terminal
+block and detect attempts at by-passing\cite{czechowski01}.
+
+\section{Regulatory frameworks around the world}
+
+Smart metering regulation varies from country to country as it is tightly coupled to the overall regulation of the
+electrical grid. The standardization of the physical form factor and metrological parameters of a meter is usually
+separate from the standardization of its \emph{smart} functionality. Most countries base the standard for their meters'
+outwards-facing communication interface on a family of standards unified under the IEC as DLMS/COSEM. Employing this
+base protocol, country-specific standardization only covers which precise variant of it is spoken and what features are
+supported.
+
+\subsection{International standards}
+
+The family of standards one encounters most in smart metering applications are IEC 62056 specifying the Device Language
+Message Specification (DLMS) and the Companion Specification for Electronic Metering (COSEM). DLMS/COSEM are
+application-layer standards describing a request/response schema similar to HTTP. DLMS/COSEM are mapped onto a
+multitude of wire protocols. They can be spoken over TCP/IP or mapped onto low-speed UART serial interfaces
+\cite{sato01,stuber01}. Besides DLMS/COSEM there are a multitude of standards usually specifying how DLMS/COSEM are to
+be applied.
+
+DLMS/COSEM show some amount of feature creep. They do not adhere to the age-old systems design adage that a tool should
+\emph{do one thing and do it well}. Instead they try to capture the convex hull of all possible applications. This led
+to a complicated design that requires extensive additional specification and testing to maintain interoperability. In
+particular in the area of transport security it becomes evident that the IEC as an electrical engineering standards body
+stretched their area of expertise where resorting to established standard protocols would have led to a better
+outcome\cite{weith01}. Compared to industry-standard transport security the IEC standards provide a simplistic key
+management framework based on a static shared key with unlimited lifetime and provide sub-optimal transport security
+properties (e.g.\ lack of forward-secrecy)\cite{khurana01,sato01}.
+
+\subsection{The regulatory situation in selected countries}
+
+In this section we will give an overview of the situation in a number of countries. This list of countries is not
+representative and notably does not include any developing countries and is geographically biased. We selected these
+countries for illustration only and based our selection in a large part on the availability of information in a language
+we can read. We will conclude this section with a summary of common themes.
+
+\subsubsection{Germany}
+
+Germany standardized smart metering on a national level. Apart from the calibration standards applying to any type of
+meter smart meters are covered by a set of communications and security standards developed by the German Federal Office
+for Information Security (BSI). Germany mandates smart meter installations for newly constructed buildings and during
+major renovations but does not require most legacy residential installations to be upgraded. This is a consequence of a
+2013 cost-benefit analysis that found these upgrades to be uneconomical for the majority of residential
+customers\cite{bmwi03,bmwi1,bmwe01,brown01}.
+
+The German standards strictly separate between metering and communication functions. Both are split into separate
+devices, the \emph{meter} and the \emph{gateway} (called \emph{smart meter gateway} in full and often abbreviated
+\emph{SMGW}). One or several meters connect to a gateway through a COSEM-derived protocol. The communication interface
+between meter and gateway can optionally be physically unidirectional. A unidirectional interface eliminates any
+possibility of meter firmware compromise. The gateway contains a cryptographic security module similar to a
+smartcard\cite{mahlknecht01} that is entrusted with signing of measurements and maintaining an authenticated and
+encrypted communication channel with its authorities. Security of the system is certified according to a Common Criteria
+process.
+
+The German specification does not include any support for disconnect switches as they are common in some other countries
+outside of demand-side management. It only does not prohibit the installation of one behind the smart meter
+installation. This makes it theoretically possible for a utility company to still install a disconnect switch to
+disconnect a customer, but this would be a separate installation from the smart meter. In Germany there are significant
+barriers that have to be met before a utility company may cut power to a household\cite{delaw01}. The elision of a
+disconnect switch means attacks on German meters will be limited in influence to billing irregularities and attacks
+using DSM equipment such as water storage heaters that represent only a fraction of overall load.
+
+\subsubsection{The Netherlands}
+The Netherlands were early to take initiative to roll out smart metering after its recognition by the European
+Commission in 2006\cite{cuijpers01,ec04}. After overcoming political issues the Netherlands were above the European
+median in 2018, having replaced almost half of all meters\cite{cuijpers01,ec03}. Dutch smart meters are standardized by
+a consortium of distribution system operators. They integrate gateway and metrology functions into one device. The
+utility-facing interface is an IEC DLMS/COSEM-based interface over cellular radio such as GPRS or LTE\cite{aubel01}. Like
+e.g.\ the German standard, the Dutch standard precisely specifies all communication interfaces of the
+meter\cite{dsmrp3}. Another parallel is that the Dutch standard also does not cover any functionality for remotely
+disconnecting a household. This absence of a disconnect switch limits attacks on Dutch smart meters, too, to causing
+billing irregularities.
+
+\subsubsection{The UK}
+
+The UK is currently undergoing a smart metering rollout. Meters in the UK are nationally standardized to provide both
+Zigbee ZSE-based and IEC DLMS/COSEM connectivity. UK smart metering specifications are shared between electrical and gas
+meters. Different to other countries' specifications the UK national specifications require electrical meters to have an
+integrated disconnect switch and gas meters to have an integrated valve. In Northern Ireland most consumers use prepaid
+electricity contracts\cite{anderson02}. Prepayment and credit functionality are also specified in the UK's national
+smart metering standard, as is remote firmware update functionality\cite{ukgov02}. Outside communications in these
+standards is performed through a gateway (there called \emph{communications hub}) that can be shared between several
+meters \cite{ukgov01,ukgov02,ukgov03,brown01,sato01}. The combination of both gas and electricity metering into one
+family of standards and the exceptionally large set of \emph{required} features make the UK regulations the maximalist
+option among the regulations in this section. The mandatory inclusion of both disconnect switches and remote
+connectivity up to remote firmware update make it an interesting attack target\cite{anderson01}.
+
+\subsubsection{Italy}
+
+Italy was among the first countries to legally mandate the widespread installation of smart meters in households. Italy
+in 2006 and 2007 by law set a starting date for the rollout in 2008\cite{brown01}. The Italian electricity market was
+recently privatized. While the wholesale market and transmission network privatization has advanced the vast majority of
+retail customers continued to use the incumbent distribution system operator ENEL as their supplier\cite{ec03}. This
+dominant position allowed ENEL to orchestrate the large-scale rollout of smart meters in Italy. Almost every meter in
+Italy had been replaced by a smart meter by 2018\cite{ec03}. A unique feature of the Italian smart metering
+infrastructure is that it relies on Power Line Communication (PLC) to bridge distances between meters and cellular radio
+gateways\cite{gungor01}.
+
+\subsubsection{Japan}
+
+Japan is currently rolling out smart metering infrastructure. Compared to other countries in Japan significant
+standardization effort has been spent on smart home integration\cite{usitc01,sato01,brown01}. Japan has domestic
+standards under its Japanese Industrial Standards organization (JIS) that determine metrology and physical dimensions.
+Tokyo utility company TEPCO is currently rolling out a deployment that is based on the IEC DLMS/COSEM standards suite
+for remote meter reading in conjunction with the Japanese ECHONET home-area network protocol. Smart meters are
+connected to TEPCO's backend systems through the customer's internet connection, sub-gigahertz radio based on 802.15.4
+framing, regular landline internet or PLC\cite{toshiba01,sato01}.
+
+A unique point in the Japanese utility metering landscape is that the current practice is monthly manual readings. In
+Japan residential utility meters are usually mounted outside the building on an exterior wall and every month someone
+with a mirror on a long stick will come and read the meter. The meter reader then makes a thermal paper print-out of the
+updated utility bill and puts it into the resident's post box. This practice gives consumers good control over their
+consumption but does incur significant personnel overhead.
+
+\subsubsection{The USA}
+
+In the USA the rollout of smart meters has been promoted by law as early as 2005. The US electricity market is highly
+complex with states having significant authority to decide on their own policies\cite{brown01}. Originally different
+from the IEC standards used in a large fraction of the rest of the world the USA developed their own domestic set of
+standards for smart meters under the American National Standards Institute (ANSI)\cite{sato01}. Today ANSI is converging
+with the IEC on the protocol layer. An obvious feature of ANSI-standard meters is that they are round and plug into a
+wall-mounted socket while IEC devices are usually rectangular and connected directly to the mains wiring through large
+screw terminals\cite{ifixit01}.
+
+\subsection{Common themes}
+
+Researching the current situation around the world for the above sections we were able to distill some common themes.
+First, smart metering is slowly advancing on a global scale and despite significant reservations from privacy-conscious
+people and consumer advocates it seems it is here to stay. Still, there are some notable exceptions of countries that
+have decided to scale-back an ongoing rollout effort after subsequent analysis showed economical or other
+issues\footnote{cf.\ the Netherlands and Germany}.
+
+\subsubsection{The introduction of smart metering}
+
+The smart meter rollout is largely driven by utility companies. Utility companies field a variety of arguments for the
+rollout. The most prominent argument is a general increase in energy-efficiency along with a reduction of emissions.
+This argument is based on the estimation that smart metering will increase private customers' awareness of their own
+consumption and this will lead them to reduce their consumption. The second highly popular argument for smart metering
+is that it is necessary for the widespread adoption of renewable energies. This argument again builds on the trend
+towards green energy to rationalize smart metering. Interestingly this argument is often formulated as an inevitability
+instead of a choice.
+
+Academic reception of smart metering is dyed with an almost unanimous enthusiasm. In particular smart meter
+communication infrastructure has received a large amount of research
+attention\cite{dzung01,gungor01,kabalci01,lloret01,mahmood01,yan01,anderson01,anderson02}. Outside of human-computer
+interaction claims that smart meters will reduce customer energy consumption have often been uncritically accepted.
+
+\subsubsection{Standardization and reality of smart devices}
+
+Regulators, utilities and academics meet in their enthusiasm on the issue of smart home integration of smart metering. A
+feature of many concepts is that the meter acts as the centerpiece of a modern, fully integrated smart
+home\cite{aubel01,geelen01,bsi-tr-03109-1,abdallah01}. The smart meter serves as a communication hub between a new class
+of grid-aware loads and the utility company's control center. Large (usually thermal) loads such as dishwashers,
+refrigerators and air conditioners are expected to intelligently adapt their heating/cooling cycles to better match
+the grid's supply. A frequent scenario is one in which the meter bills the customer using near-real time pricing, and
+supplies large loads in the customer's household with this pricing information. These loads then intelligently schedule
+their operation to minimize cost\cite{sato01}. At the time between 2000 and 2005 when smart metering proposals were
+first advanced this vision might have been an effect of the \emph{law of the instrument}\cite{kaplan01,anderson02}. Back
+then outside of specialty applications household devices were not usually networked\cite{merz01}. Smart meters at the
+time may have seemed to be the obvious choice for a smart home communications hub.
+
+From today's perspective, this idea is obviously outdated. Smart \emph{things} now have found their way into many homes.
+Only these things are directly interconnected through the internet--foregoing the home-area network (HAN) technologies
+anticipated by smart metering pioneers. The simple reason for this is that nowadays anyone has Wifi, and Wifi
+transceivers have become inexpensive enough to disappear in the bill of materials (BOM) cost of a large home device such
+as a washing machine. Smart meters are usually situated in the basement--physically far away from most of one's devices.
+This makes connecting them to said devices awkward and connecting them via the local Wifi raises the question why the
+smart devices should not simply use the internet directly.
+
+Connecting things to a smart meter through a local bus is academically appealing. It promises cost-savings from a
+simpler physical layer (such as ZigBee instead of Wifi) and it neatly separates concerns into home infrastructure and
+the regular internet. Communication between smart meter and devices never leaves the house. This promises tolerance to
+utility backend systems breaking. It also physically keeps communication inside the house, bypassing the utility's eyes
+improving both customer privacy and agency. The presently popular model of a device as simple as a light bulb proxying
+its every action through a manufacturer's servers somewhere on the public internet is in stark contrast to this
+scenario. Alas, the reason that this model is as popular is that in most cases it simply works. Device manufacturers
+integrate one of many off-the-shelf Wifi modules. The resulting device will work anywhere on earth\footnote{For some
+places channel assignments may have to be updated. This is a configuration-level change and in some devices can be done
+by the end-user during provisioning.}. A HAN-connected device would have several variants with different modems for
+different standards. Some might work across countries, but some might not. And in some countries there might not even be
+a standard for smart grid HANs.
+
+Looking at the situation like this begs the question why this realization has not yet found its way into mainstream
+acceptance by smart metering implementors. The customer-facing functionality promised through smart meters would be
+simple to implement as part of a now-standard \emph{Internet of Things} application. An in-home display that shows
+real time energy consumption and cost statistics would simply be an Android tablet fetching summarized data from the
+utility's billing backend. Custom hardware for this purpose seems anachronistic today. Demand-side response by large
+loads would be as simple as an HTTPS request with a token identifying the customer's contract that returns the
+electricity price the meter is currently charging along with a recommendation to switch on or off. It seems the smart
+home has already arrived while smart metering is still getting off the starting blocks\cite{anderson02}.
+% TODO is this too critical? Is maybe the modern smart home compatible with smart meters? Is maybe the local-only path
+% of data, avoiding utility clouds a design feature? (may be true in DE, NL, probably not anywhere else)
+
+\section{Security in smart distribution grids}
+
+The smart grid in practice is nothing more or less than an aggregation of embedded control and measurement devices that
+are part of a large control system. This implies that all the same security concerns that apply to embedded systems in
+general also apply to most components of a smart grid. Where programmers have been struggling for decades now with input
+validation\cite{leveson01}, the same potential issue raises security concerns in smart grid scenarios as well\cite{mo01,
+lee01}. Only, in smart grid we have two complicating factors present: Many components are embedded systems, and as such
+inherently hard to update. Also, the smart grid and its control algorithms act as a large (partially-)distributed
+system making problems such as input validation or authentication harder\cite{blaze01} and adding a host of distributed
+systems problems on top\cite{lamport01}.
+
+Given that the electrical grid is essential infrastructure in our modern civilization, these problems amount to
+significant issues in practice. Attacks on the electrical grid may have grave consequences\cite{anderson01,lee01} while
+the long maintenance cycles of various components make the system slow to adapt. Thus, components for the smart grid
+need to be built to a much higher standard of security than most consumer devices to ensure they live up to well-funded
+attackers even decades down the road. This requirement intensifies the challenges of embedded security and distributed
+systems security among others that are inherent in any modern complex technological system. The safety-critical nature
+of the modern smart metering ecosystem in particular was quickly recognized by security experts\cite{anderson01}.
+
+A point we will not consider in much depth in this work is theft of electricity. An incentive for the introduction of
+smart metering that is frequently cited in utility industry publications outside of a general public's view is the
+reduction of electricity theft\cite{czechowski01}. Academic publications tend to either focus on other benefits such as
+generation efficiency gains through better forecasting or rationalize the consumer-unfriendly aspects of smart metering
+with ``enormous social benefits''\cite{mcdaniel01}. They do not usually point out the economical incentive such
+\emph{revenue protection} mechanisms provide\cite{anderson01,anderson02}.
+
+\subsection{Privacy in the smart grid}
+
+A serious issue in smart metering setups is customer privacy. Even though the meter ``only'' collects aggregate energy
+consumption of a whole household this data is highly sensitive\cite{markham01}. This counterintuitive fact was initially
+overlooked in smart meter deployments leading to outrage, delays and reduced features\cite{cuijpers01}. The root cause
+of this problem is that given sufficient timing resolution these aggregate measurements contain ample entropy. Through
+disaggregation algorithms individual loads can be identified and through pattern matching even complex usage patterns
+can be discerned with alarming accuracy\cite{greveler01}. Similar privacy issues arise in many other areas of modern
+life through pervasive tracking and surveillance\cite{zuboff01}. What makes the case of smart metering worse is that
+even the fig leaf of consent such practices often hide behind does not apply here. If a citizen does not consent to
+Google's privacy policy Google says they can choose not to use their service. In today's world this may not be a free
+choice thereby invalidating this argument but it is at least technically possible. Smart metering on the other hand is
+mandated by law and depending on the law a customer unwilling to accept the accompanying privacy violation may not be
+able to evade it\cite{bmwi04}.
+
+\subsection{Smart grid components as embedded devices}
+
+A fundamental challenge in smart grid implementations is the central role smart electricity meters play. Smart meters
+are used both for highly-granular load measurement and (in some countries) load switching\cite{zheng01}. Smart
+electricity meters are effectively consumer devices. They are built down to a certain price point that is measured by
+the burden it puts on consumers. The cost of a smart meter is ultimately limited by it being a major factor in the
+economies of a smart meter rollout\cite{bmwi03}. Cost requirements preclude some hardware features such as the use of a
+standard hardened software environment on a high powered embedded system (such as a hypervirtualized embedded Linux
+setup) that would both increase resilience against attacks and simplify updates. Combined with the small market sizes in
+smart grid deployments this results in a high cost pressure on the software development process for smart electricity
+meters. Most vendors of smart electricity meters only serve a handful of markets. A large fraction of smart meter
+development cost lies in the meter's software. Landis+Gyr, a large manufacturer that makes most of its revenue from
+utility meters in their 2019 annual report write that they spend \SI{36}{\percent} of their total R\&D budget on embedded
+software (firmware) while spending only \SI{24}{\percent} on hardware R\&D\cite{landisgyr01,landisgyr02}. There exist
+multiple competing standards applicable to various parts of a smart electricity meter and most countries have their own
+certification regimen\cite{cenelec01}. This complexity creates a large development burden for new market
+entrants\cite{perez01}.
+
+\subsection{The state of the art in embedded security}
+
+Embedded software security generally is much harder than security of higher-level systems. This is due to a combination
+of the unique constraints of embedded devices: Among others they are hard to update and usually produced in small
+quantities. They also lack capabilities compared to full computers. Processing power is limited and memory protection
+functions are spartan. Even well-funded companies continue to have trouble securing their embedded
+systems. A spectacular example of this difficulty is the recently-exposed flaw in Apple's iPhone SoC first-stage ROM
+bootloader\footnote{
+ Modern system-on-chips integrate one or several CPUs with a multitude of peripherals, from memory and DMA
+ controllers over 3D graphics accelerators down to general-purpose IO modules for controlling things like indicator
+ LEDs. Most SoCs boot from one of several boot devices such as flash memory, Ethernet or USB according to a
+ configuration set by pin-strapping configuration IOs or through write-only fuse bits.
+
+ Physically, one of the processing cores of the SoC (usually one of the main CPU cores) is connected such that it is
+ taken out of reset before all other devices, and is tasked with enabling and configuring all other peripherals of
+ the SoC. In order to run later initialization code or more advanced bootloaders, this core on startup runs a very
+ small piece of code hard-burned into the SoC in the factory. This ROM loader initializes the most basic peripherals
+ such as internal SRAM memory and selects a boot device for the next bootloader stage.
+
+ Apple's ROM loader measures only a few hundred bytes. It performs authorization checks to ensure only software
+ authorized by Apple is booted. The present flaw allows an attacker to circumvent these checks and boot their own
+ code on a USB-connected iPhone. This compromises Apple's chain of trust from ROM loader to userland right at its
+ root. Since this is a flaw in the factory-programmed first stage read-only boot code of the SoC it cannot be patched
+ in the field.
+}, that allows a full compromise of any iPhone before the iPhone X. iPhone 8, one of the affected models, was still
+being manufactured and sold by Apple until April 2020. In another instance in 2016 researchers found multiple flaws in
+the secure-world firmware used by Samsung in their mobile phone SoCs. The flaws they found were both severe
+architectural flaws such as secret user input being passed through untrusted userspace processes without any protection
+and shocking cryptographic flaws such as
+CVE-2016-1919\footnote{\url{http://cve.circl.lu/cve/CVE-2016-1919}}\cite{kanonov01}. And Samsung is not the only large
+multinational corporation having trouble securing their secure world firmware implementation. In 2014 researchers found
+an embarrassing integer overflow flaw in the low-level code handling untrusted input in Qualcomm's QSEE
+firmware\cite{rosenberg01}. For an overview of ARM TrustZone including a survey of academic work and past security
+vulnerabilities of TrustZone-based firmware see \cite{pinto01}.
+
+For their mass-market phones these companies have R\&D budgets that dwarf some countries' national budgets. If even
+they have trouble securing their secure embedded software stacks, what is a smart meter manufacturer to do? If a
+standard, as in the case of the German one, requires IP gateways to speak TLS, a protocol that is notoriously tricky to
+implement correctly\cite{georgiev01}, the manufacturer is short on options to secure their product.
+
+Since thorough formal verification of code is not yet within reach for either large-scale software development or code
+heavy in side-effects such as embedded firmware or industrial control software\cite{pariente01} the two most effective
+measures for embedded security are reducing the amount of code on one hand, and labor-intensively reviewing and testing
+this code on the other hand. A smart meter manufacturer does not have a say in the former since it is bound by the
+official regulations it has to comply with, and will likely not have sufficient resources for the latter. We are left
+with an impasse: Manufacturers in this field likely do not have the security resources to keep up with complex standards
+requirements. At the same time they have no option to reduce the scope of their implementation to alleviate the burden
+on firmware security.
+
+\subsection{Attack avenues in the smart grid}
+
+If we model the smart grid as a control system responding to changes in inputs by regulating outputs, on a very high
+level we can see two general categories of attacks: Attacks that directly change the state of the outputs, and attacks
+that try to influence the outputs indirectly by changing the system's view of its inputs. The former would be an attack
+such as shutting down a power plant to decrease generation capacity\cite{lee01}. The latter would be an attack such as
+forging grid frequency measurements where they enter a power plant's control systems to provoke the control systems to
+oscillate\cite{kosut01,wu01,kim01}.
+
+\subsubsection{Communication channel attacks}
+
+Communication channel attacks are attacks on the communication links between smart grid components. This could be
+attacks on IP-connected parts of the core network or attacks on shared busses between smart meters and IP gateways in
+substations. Generally, these attacks can be mitigated by securing the aforementioned communication links using modern
+cryptography. IP links can be protected using TLS, and more low-level busses can be protected using more lightweight
+Noise\cite{perrin01}-based protocols.
+
+Cryptographic security transforms an attacker's ability to read and manipulate communication contents into a mere denial
+of service attack. Thus, in addition to cryptographic security safety under DoS conditions must be ensured for continued
+system performance under attacks. This safety property is identical with the safety required to withstand random outages
+of components, such as communication link outages due to physical damage from storms, flooding etc.\cite{sato01}. In
+general attacks at the meter level are hard to weaponize. Meters primarily serve billing purposes. The use of smart
+meter data for load forecasting is not yet common practice. Once it is this data will only be used to refine existing
+forecasting models that are based on aggregate data collected at higher vantage points in the distribution grid. This
+combination of smart metering data with more trusted aggregate data from sensors within the grid infrastructure limits
+the potential impact of a data falsification attack on smart meters. It also allows the utility to identify potentially
+corrupt meter readings and thus detect manipulation above a certain threshold. In order for an attack to have more
+far-reaching consequences the attacker would need to compromise additional grid infrastructure\cite{kim01,kosut01}.
+
+\subsubsection{Exploiting centralized control systems}
+
+The type of smart grid attack most often cited in popular discourse, and to the author's knowledge the only type that
+has so far been carried out in practice, is a direct attack on centralized control systems. In this attack, computer
+components of control systems are compromised by the same techniques used to compromise any other kind of computer
+system such as spear phishing, exploiting insecure services running on internet-exposed ports and using one compromised
+system to compromise other systems on the same ostensibly secure internal network. These attacks are very powerful as
+they yield the attacker direct control over whatever outputs the compromised control systems are controlling. If an
+attacker manages to compromise the right set of control computers, they may even be able to cause physical
+damage\cite{lee01}.
+
+Despite their potentially large impact, these attacks are only moderately interesting from a scientific perspective. For
+one, their mitigation mostly consists of a straightforward application of decades-old security best practices. Though
+there is room for the implementation of genuinely new, power systems-specific security systems in this field, the general
+state of the art is lagging behind other fields of embedded security. From this background low-hanging fruit should take
+priority\cite{heise02}. Given political will these systems can readily be fortified. There is only a comparatively
+small number of them and having a technician drive to every one of them in turn to install a firmware security update is
+feasible.
+
+\subsubsection{Control function exploits}
+
+Control function exploits are attacks on the mathematical control loops used by the centralized control system. One
+example of this type of attack is the resonance attack described in \cite{wu01}. In this kind of attack, inputs from
+peripheral sensors indicating grid load to the centralized control system are carefully modified to cause a
+disproportionately large oscillation in control system action. This type of attack relies on complex resonance effects
+that arise when mechanical generators are electrically coupled. These resonances, colloquially called ``modes'', are
+well-studied in power system engineering\cite{rogers01,grebe01,entsoe01,crastan03}. Even disregarding modern attack
+scenarios, for stability electrical grids are designed with measures in place to dampen any resonances inherent to grid
+structure. These resonances are hard to analyze since they require an accurate grid model and they are unlikely to be
+noticed under normal operating conditions.
+
+Mitigation of these attacks can be achieved by ensuring unmodified sensor inputs to the control systems in the first
+place. Carefully designing control systems not to exhibit exploitable behavior such as oscillations is also possible but
+harder.
+
+\subsubsection{Endpoint exploits}
+
+To us, the more interesting attack on smart grid systems is someone exploiting the grid's endpoint devices such as
+smart electricity meters. These meters are deployed on a massive scale, with at least one meter per household on
+average\footnote{Households rarely share a meter but some households may have a separate meter for detached properties
+such as a detached garage or basement.}. Once compromised, restoration to an uncompromised state can be difficult if it
+requires physical access to thousands of devices in hard-to-access locations.
+
+By compromising smart electricity meters, an attacker can forge the distributed energy measurements these devices
+perform. In a best-case scenario, this might only affect billing and lead to customers being under- or over-charged if
+the attack is not noticed in time. In a less ideal scenario falsified energy measurements reported by these devices
+could impede the correct operation of centralized control systems.
+
+In some countries such as the UK smart meters have one additional function that is highly useful to an attacker: They
+contain high-current disconnect switches to disconnect the entire household or business in case electricity bills are
+left unpaid for a certain period. In countries that use these kinds of systems on a widespread level, the load
+disconnect switch is controlled by the smart meter's central microcontroller. This allows anyone compromising this
+microcontroller's firmware to actuate the disconnect switch at will. Given control over a large number of
+network-connected smart meters, an attacker might thus be able to cause large-scale disruptions of power
+consumption\cite{anderson01,temple01}. Combined with an attack method such as the resonance attack from \cite{wu01}
+that was mentioned above, this scenario poses a serious threat to grid stability.
+
+In places where Demand-Side Management (DSM) is common this functionality may be abused in a similar way. In DSM the
+smart metering system directly controls power to certain devices such as heaters. The utility can remotely control the
+turn-on and turn-off of these devices to smoothen out the load curve. In exchange the customer is billed a lower price
+for the energy consumed by these loads. DSM was traditionally done in a federated fashion usually through low-frequency
+PLC over the distribution grid\cite{dzung01}. Smart metering systems no longer require large, resource-intensive
+transmitters in substations and bear the potential for a rollout of such technology on a much wider scale than before.
+This leads to a potentially significant role of DSM systems in the impact calculation of an attack on a smart metering
+system. DSM does not control as much load capacity as remote disconnect switches do but the attacks cited in the above
+paragraph still fundamentally apply.
+
+\subsection{Practical threats}
+
+As a highly integrated system the electrical grid is vulnerable to attacks from several angles. One way to classify
+attacks is by their motivation. Along this axis we found the following motives:
+
+\begin{description}
+ \item[Service disruption.] An attack aimed at disrupting service could e.g.\ aim at causing a blackout. It could
+ also take aim in a more subtle way targeting a degradation of parameters such as power quality (voltage,
+ frequency and waveform). It could target a particular customer, geographic area or all parts of the grid.
+ Possible motivations range from a teenage hacker's boredom to actual cyberwar\cite{cleveland01,lee01}.
+ \item[Commercial disruption.] Simple commercial motives already motivate a wide variety of attacks on grid
+ infrastructure\cite{czechowski01}. Though generally mostly harmless from a cybersecurity point of view there are
+ instances where these attacks put the lives of both the attacker and bystanders at grave risk\cite{anderson01}.
+ Such attacks generally aim at the meter itself but a more sophisticated attacker might also target the
+ utility's backend computer bureaucracy.
+ \item[Data extraction.] The smart grid collects large amounts of data on both individual consumers and on an
+ aggregate level. The privacy risk in individual consumers' data is obvious. On the web
+ data collection practices ranging from questionable to flat-out illegal have widely proliferated for various
+ purposes including election manipulation\cite{heise03}. Assuming criminals in this field would eschew
+ fertile ground such as this due to legal or ethical concerns is optimistic. Taking the risk to individual
+ customers' data out of the equation even aggregate data is still highly attractive to some. Aggregate real-time
+ electricity usage data is a potential source of timely information on matters such as national social events
+ (through TV set energy consumption\cite{greveler01}) or the state of the economy.
+\end{description}
+
+A factor to consider in all these cases is that one actor's attacks have the potential to weaken system security
+overall. An attacker might add new backdoors to gain persistence or they might disable existing mitigations to enable
+further steps of their attack.
+
+In this paper we will largely concentrate on attacks of the first type because they both have the most serious
+consequences and the most motivated attackers. Attackers that may want to disrupt service include nation states'
+cyberwar operations. This type of attacker is both highly skilled and highly funded.
+
+\subsection{Conclusion or, why we are doomed}
+
+We can conclude that a compromise of a large number of smart electricity meters cannot be ruled out. The complexity of
+network-connected smart meter firmware makes it exceedingly unlikely that it is in fact flawless. Large-scale
+deployments of these devices sometimes with disconnect relays make them an attractive target for attackers interested in
+causing grid instability. The attacker model for these devices includes nation states, who have considerable resources
+at their disposal.
+
+For a reasonable guarantee that no large-scale compromises of hard- and software built today will happen over a span of
+some decades, we would have to radically simplify its design and limit attack surface. Unfortunately, the complexity of
+smart electricity meter implementations mostly stems from the large list of requirements these devices have to conform
+with. Alas, the standards have already been written, political will has been cast into law and changes that reduce scope
+or functionality have become exceedingly unlikely at this point.
+
+A general observation with smart grid systems of any kind is that they comprise a departure from the federated
+control structure of yesterday's ``dumb'' grid and the advent of centralization to an enormous scale. This modern,
+centralized infrastructure has been carefully designed to defend against malicious actors and all involved parties have
+an interest in keeping it secure but in centralized systems scaling attacks is inherently easier than in decentralized
+systems\cite{anderson02}. An attacker can employ centralized control to their advantage. From this perspective the
+centralization of smart metering control systems--sometimes up to a national level\cite{anderson01,anderson02}--poses a
+security risk.
+
+\chapter{Restoring endpoint safety in an age of smart devices}
+
+As laid out in the previous section we cannot fully rule out a large-scale compromise of smart energy meters at some
+point in the long-term future. Instead we have to rephrase our claim to security. We cannot rule out exploitation: We
+have to limit its impact. Assuming that we cannot strip any functionality from smart meters all we can do is to flush
+out an attacker once they are in. Mitigation replaces prevention.
+
+In a worst-case scenario an attacker would gain unconstrained code execution e.g.\ by exploiting a flaw in a network
+protocol implementation. Smart meters use standard microcontrollers that do not have advanced memory protection functions
+(cf.\ Section \ref{sm-cpu}). We can assume the attacker has full control over the main microcontroller given any such
+flaw. With this control they can actuate the disconnect switch if present. They can transmit data through the device's
+communication interfaces or use the user interface components such as LEDs and the LCD. Using the self-programming
+capabilities of flash microcontrollers an attacker could even gain persistence. Note that in systems separating
+cryptographic functions into some form of cryptographic module\footnote{such as systems used in
+Germany\cite{bsi-tr-03109}.} we can be optimistic and assume the attacker has not yet compromised this cryptographic
+co-processor.
+
+With the meter's core microcontroller under attacker control we cannot use this microcontroller to restore control over
+the system. We have no way of ensuring the attacker does not simply delete a security mechanism we include in the core
+microcontroller's firmware. Theoretically a secure boot implementation could be used to ensure meters boot into a safe
+state after temporary power loss but we cannot rely on secure boot being present on every smart meter application
+controller. Nowadays secure boot is a standard feature in many SoCs aimed at smartphones or smart TVs but it is still
+very uncommon in microcontrollers.
+
+Our solution to this problem is to add another smaller microcontroller to the smart meter design. This microcontroller
+will contain a small piece of software that receives cryptographically authenticated commands from utility companies. On
+demand it can reset the meter's core microcontroller to a known-good state. To reliably flush out an attacker from a
+compromised core microcontroller we re-program the core microcontroller in its entirety. We propose using JTAG to
+re-program the core microcontroller with a known-good firmware image read from a sufficiently large SPI flash connected
+to the reset controller. JTAG is supported by most microcontrollers complex enough to be used in a smart meter design.
+JTAG programming functionality can be ported to a new microcontroller with relatively little work.
+
+Our solution requires the core microcontroller's JTAG interface to be activated (i.e.\ not fused-shut). For our solution
+to work the core microcontroller firmware must not be able to permanently disable the JTAG interface by itself. In
+microcontrollers that do not yet provide this functionality this is a minor change that could be added to a custom
+microcontroller variant at low cost. On most microcontrollers keeping JTAG open should not interfere with code readout
+protection\footnote{Readout protection usually forces a device to erase its program and data memories before allowing
+JTAG access.}. Code secrecy should be of no concern\cite{schneier01} here but some manufacturers have strong preferences
+due to a fear of copyright infringement.
+
+\section{The theory of endpoint safety}
+\label{sec_criteria}
+
+In order to gain anything by adding our reset controller to the smart meter's already complex design we must satisfy two
+interrelated conditions.
+\begin{enumerate}
+\item \emph{security} means our reset controller itself does not have any remotely exploitable flaws
+\item \emph{safety} means our reset controller will perform its job as intended
+\end{enumerate}
+
+Note that our \emph{security} property includes only remote exploitation, and excludes any form of hardware attack.
+Even though most smart meters provide some level of physical security, we do not wish to make any assumptions on this.
+In the following section we will elaborate our attacker model and it will become apparent that sufficient physical
+security to defend against all attackers in our model would be infeasible, and thus we will design our overall system
+to remain secure even if we assume some number of physically compromised devices.
+
+\subsection{Attack characteristics}
+The attacker model the two above conditions must hold under is as follows. We assume three angles of attack: Attacks by the
+customer themselves, attacks by an insider within the utility company controlling the metering systems and lastly attacks
+from third parties. Examples for these third parties are hobbyist hackers or outside cybercriminals on the one hand,
+but also other companies participating in the smart grid infrastructure besides the utility company such as intermediary
+providers of meter-reading services.
+
+Due to the critical nature of the electrical grid, we have to include hostile state actors in our attacker model. When
+acting directly, these would be classified as third-party attackers by the above schema, but they can reasonably be
+expected to be able to assume either of the other two roles as well e.g.\ through infiltration or bribery. In the
+generalized attacker model in \cite{fraunholz01} the authors give a classification of attacker types and provide a nice
+taxonomy of attacker properties. In their threat/capability rating, criminals are still considered to have a higher threat
+rating than state-sponsored attackers. The New York Times reported in 2016 that some states recruit their hacking
+personnel in part from cybercriminals. If this report is true, in a worst-case scenario we have to assume a
+state-sponsored attacker to be the worst of both types. Comparing this against the other attacker types in
+\cite{fraunholz01}, this state-sponsored attacker is strictly worse than any other type in both variables. We are left
+with a highly-skilled, very well-funded, highly intentional and motivated attacker.
+
+Based on the above classification of attack angles and our observations on state-sponsored attacks, we can adapt
+\cite{fraunholz01} to our problem, yielding the following new attacker types:
+
+\begin{enumerate}
+ \item \textbf{Utility company insiders controlled by a state actor.}
+ We can ignore the other internal threats described in \cite{fraunholz01} since an insider coöperating with a
+ state actor is strictly worse in every respect.
+ \item \textbf{State-sponsored external attackers.}
+ A state actor can directly attack the system through the internet and with proper operations security they do
+ not risk exposure or capture.
+ \item \textbf{Customers controlled by a state actor.}
+ A state actor can very well compromise some customers for their purposes. They might either physically
+ infiltrate the system posing as legitimate customers, or they might simply deceive or bribe existing customers
+ into coöperation.
+ \item \textbf{Regular customers.}
+ A hostile state actor might gain control of some number of customers through means such as voluntary
+ coöperation, bribery or infiltration but this limits the scale of an attack since an attacker has to avoid
+ arousing premature attention. Though regular customers may not have the motivation, skill or resources of a
+ state-sponsored attacker, potentially large numbers of them may try to attack a system out of financial
+ incentives\cite{anderson01,czechowski01}. To allow for this possibility, we consider regular customers separate
+ from state actors posing as customers.
+\end{enumerate}
+
+\subsection{Overall structural system security}
+
+Considering overall security, we first introduce the reset authority, a trusted party acting as the single authority for
+issuing reset commands in our system. In practice this trusted party may be part of the utility company, part of an
+external regulatory body or a hybrid setup requiring both to coöperate. We assume this party will be designed to be
+secure against all of the above attacker types. The precise design of this trusted party is out of scope for this work
+but we will provide some practical suggestions on how to achieve security below in Section \ref{sec-regulation}.
+
+Using an asymmetric cryptographic design centered around the reset authority, we rule out all attacks except for
+denial-of-service attacks on our system by any of the four attacker types. All reset commands in our system originate
+from the reset authority and are cryptographically secured to provide authentication and tamper detection. Under this
+model attacks on the electrical grid components between the reset authority and the customer device degrade into denial
+of service attacks. To ensure the \emph{safety} criterion from Section \ref{sec_criteria} holds we must make sure our
+cryptography is secure against man-in-the-middle attacks and we must try to harden the system against denial-of-service
+attacks by the attacker types listed above. Given our attacker model we cannot fully guard against this sort of attack
+but we can at least choose a communication channel that is resilient under the above model.
+
+Finally, we have to consider the issue of hardware security. We will solve the problem of physical attacks by simply not
+programming any secret information into devices. This also simplifies hardware production. We consider supply-chain
+attacks out-of-scope for this work.
+
+\subsection{Complex microcontroller firmware}
+
+The \emph{security} property from Section \ref{sec_criteria} is in large part reliant on the security of our reset
+controller firmware. The best method to increase firmware security is to reduce attack surface by limiting external
+interfaces as much as possible and by reducing code complexity as much as possible. If we avoid the complexity of most
+modern microcontroller firmware we gain another benefit beyond implicitly reduced attack surface: If the resulting
+design is small enough we may even succeed in formal verification of our security properties. Though formal
+verification tools are not yet suitable for highly complex tasks they are already adequate for small amounts of code and
+simple interfaces.
+
+\subsection{Modern microcontroller hardware}
+
+Microcontrollers have gained enormously in both performance and efficiency as well as in peripheral support. Alas, these
+gains have largely been driven by insatiable customer demand for faster, more powerful chips and for the longest time
+security has not been considered important outside of some specific niches such as smartcards. A few years ago a
+microcontroller would spend its entire lifetime without ever being exposed to any networks\cite{anderson02}. Though this
+trend has been reversing with the increasing adoption of internet-of-things devices and more advanced security features
+have started appearing in general-purpose microcontrollers, most still lack even basic functionality found in processors
+for computers or smartphones.
+
+One of the components lacking from most microcontrollers is strong memory protection or even a memory mapping unit as it
+is found in all modern computer processors and SoCs for applications such as smartphones. Without an MPU (Memory
+Protection Unit) or MMU (Memory Management Unit) many memory safety mitigations cannot be implemented. This and the
+absence of virtualization tools such as ARM's TrustZone make hardening microcontroller firmware a big task. It is very
+important to ensure memory safety in microcontroller firmware through tools such as defensive coding, extensive testing
+and formal verification.
+
+In our design we achieve simplicity on two levels: One, we isolate the very complex metering firmware from our reset
+controller by having both run on separate microcontrollers. Two, we keep the reset controller firmware itself extremely
+simple to reduce attack surface there. Our protocol only has one message type and no state machine.
+
+\subsection{Safety vs. security: Opting for restoration instead of prevention}
+
+By implementing our reset system as a physically separate microcontroller we sidestep most security issues around the
+main application microcontroller. There are some simple measures that can be taken to harden its firmware.
+Implementing industry best practices such as memory protection or stack canaries will harden the system and increase the
+cost of an attack but it will not yield a system that we can be confident enough in to say it is fully secure. The
+complexity of the main application controller firmware makes fully securing the system a formidable effort--and one that
+would have to be repeated by every meter vendor for every one of their code bases.
+
+In contrast to this our reset system does not provide any additional security. Any attack that could occur without it
+can still occur with it in place. What it provides is a fail-safe mechanism that can quickly immobilize a malicious
+actor mid-attack. It does this in a way that can be adapted to any meter architecture and any microcontroller platform
+with low effort since it relies on established standard interfaces such as JTAG and SWD. Concentrating research and
+development resources on a single platform like this allows for a system that is more economical to implement across
+device series and across vendors.
+
+Attack resilience in the power grid can benefit from a safety-focused approach. The greater threat such an attack poses
+is not the temporary denial of service of utility metering functions. Even in a highly integrated smart grid as
+envisioned by utility companies these measurement functions are used by utility companies to increase efficiency and
+reduce cost but are not necessary for the grid to function at all. Thus if we can provide mere \emph{safety} with a
+fail-safe semantic instead of unattainable perfect \emph{security} we have gained resilience against a large class of
+realistic attack scenarios.
+
+\subsection{Technical outline of a safety reset system}
+
+There are several ways our system could be practically implemented. The most basic way is to add a separate
+microcontroller connected to the meter's main application MCU and optionally other embedded microcontrollers such as
+modems. This discrete chip could either be placed on the metering board itself or it could be placed on a separate PCB
+connected to the programming interface(s) of the metering board. In certain cases the latter might allow its use in
+otherwise unmodified legacy designs.
+
+The safety reset controller would be a much simpler MCU than the meter's main application controller. Its software can
+be kept simple leading to low program flash and RAM requirements. Since it does not need to address rich periphery such
+as external parallel memory, LCDs etc.\ it can be a physically small, low-pin count device. If the main application
+controller is supposed to be reset to a full factory image with little or no reduced functionality its firmware image
+size is certainly too large for the reset controller's embedded flash. Thus a realistic setup would likely use an
+external SPI flash chip to store this image.
+
+The most likely interfaces to reset the main application controller and possibly other microcontrollers such as modem
+chips would be the controller's integrated programming port such as JTAG. Parallel high-voltage flash programming has
+come to be uncommon in modern microcontrollers and most nowadays use some form of a serial interface. There exist a
+variety of serial programming and debug interfaces but JTAG has grown to be by far the most broadly supported one and
+has largely displaced vendor-specific debug interfaces except for very small devices.
+
+The kind of microcontroller that would likely be used as the main application controller in a smart meter application
+will almost certainly support JTAG. These microcontrollers are high pin-count devices since they need to connect to a
+large set of peripherals such as the LCD and the large program flash makes it likely for a proper debugging interface to
+be present. The one remaining issue in this coarse technical outline is what communication interface should be used to
+transmit the trigger command to the reset controller. In the following section we will give an overview on communication
+interfaces established in energy metering applications and evaluate each of them for our purpose.
+
+\section{Communication channels on the grid}
+
+There are a number of well-established technologies for communication on or along power lines. We can distinguish three
+basic system categories: Systems using separate wires (such as DSL over landline telephone wiring), wireless radio
+systems (such as LTE) and \emph{power line communication} (PLC) systems that reuse the existing mains wiring and
+superimpose data transmissions onto the \SI{50}{\hertz} mains sine\cite{gungor01,kabalci01}.
+
+For our scenario, we will ignore short-range communication systems. There exists a large number of \emph{wideband}
+power line communication systems that are popular with consumers for bridging Ethernet segments between parts of an
+apartment or house. These systems transmit up to several hundred megabits per second over distances up to several tens
+of meters\cite{kabalci01}. Technologically, these wideband PLC systems are very different from \emph{narrowband}
+systems used by utilities for load management among other applications and they are not relevant to our analysis.
+
+\subsection{Power line communication (PLC) systems and their use}
+
+In long-distance communications for applications such as load management, PLC systems are attractive since they allow
+re-using the existing wiring infrastructure and have been used as early as in the 1930s\cite{hovi01}. Narrowband PLC
+systems are a potentially low-cost solution to the problem of transmitting data at small bandwidth over distances of
+several hundred meters up to tens of kilometers.
+
+Narrowband PLC systems transmit on the order of kilobits per second or slower. A common application of this sort of system is
+\emph{ripple control}. Ripple control systems superimpose a low-frequency signal at some few hundred Hertz carrier
+frequency on top of the \SI{50}{\hertz} mains sine. This low-frequency signal is used to encode switching commands for
+non-essential residential or industrial loads. Ripple control systems provide utilities with the ability to actively
+control demand while promising savings in electricity cost to consumers\cite{dzung01}.
+
+In any PLC system there is a strict trade-off between bandwidth, power and distance. Higher bandwidth requires higher
+power and reduces maximum transmission distance. Where ripple control systems usually use few transmitters to cover
+the entire grid of a regional distribution utility, higher bandwidth bidirectional systems used for automatic meter
+reading (AMR) in places such as Italy or France require repeaters within a few hundred meters of a transmitter.
+
+\subsection{Landline and wireless IP-based systems}
+
+Especially in automated meter reading (AMR) infrastructure the cost-benefit trade-off of power line systems does not
+always work out for utilities. A common alternative in these systems is to use the public internet for communication.
+Using the public internet has the advantage of low initial investment on the part of the utility company as well as
+quick commissioning. Disadvantages compared to a PLC system are potentially higher operational costs due to recurring
+fees to network providers as well as lower reliability. Being integrated into power grid infrastructure, a PLC system's
+failure modes are highly correlated with the overall grid. Put briefly, if the PLC interface is down, there is a good
+chance that power is out, too. In contrast general internet services exhibit a multitude of failures that are entirely
+uncorrelated to power grid stability. For applications such as meter reading for billing purposes, this level of reliability is
+sufficient. However for systems that need to hold up in crisis situations such as the recovery system we are
+contemplating in this thesis, the public internet may not provide sufficient reliability.
+
+\subsection{Short-range wireless systems}
+
+Smart meters contain copious amounts of firmware but still pale in comparison to the complexity of full-scale computers
+such as smartphones. For short-range communication between a meter and a cellular radio gateway mounted nearby or
+between a meter and a meter reading operator in a vehicle on the street a protocol such as Wi-Fi (IEEE 802.11) is too
+complex. Absent widely-used standards in this space proprietary radio protocols grew attractive. These are often based
+on some standardized lower-level protocol such as ZigBee (IEEE 802.15.4) but entirely home-grown ones also exist. To the
+meter manufacturer a proprietary radio protocol has several advantages. It is easy to implement and requires no external
+certification. It can be customized to its specific application. In addition it provides vendor lock-in to customers
+sharing infrastructure such as a cellular radio gateway between multiple devices. In other fields a lack of
+standardization has led to a proliferation of proprietary protocols and a fragmented protocol landscape. This is a large
+problem since the consumer cannot easily integrate products made by different manufacturers into one system. In advanced
+metering infrastructure this is unlikely to be a disadvantage since usually there is only one distribution grid
+operator for an area. Shared resources such as a cellular radio gateway would most likely only be shared within a
+single building and usually they are all operated by the same provider.
+
+Systems in Europe commonly support Wireless M-Bus, a European standardized protocol\cite{silabs01} that operates on
+several ISM bands\footnote{
+ Frequency bands that can be used for \emph{Industrial, Scientific and Medical} applications by anyone and that do
+ not require obtaining a license for transmitter operation. Manufacturers can use whatever protocol they like on
+ these bands as long as they obtain certification that their transmitters obey certain spectral and power
+ limitations.
+}. ZigBee is another popular standard and some vendors additionally support their own proprietary protocols\footnote{
+ For an example see \cite{honeywell01}.
+}.
+
+\subsection{Frequency modulation as a communication channel}
+
+For our system, we chose grid frequency modulation (henceforth GFM) as a low-bandwidth unidirectional broadcast
+communication channel. Compared to traditional PLC, GFM requires only a small amount of additional equipment, works
+reliably throughout the grid and is harder to manipulate by a malicious actor.
+
+Grid frequency in Europe's synchronous areas is nominally 50 Hertz, but there are small load-dependent variations from
+this nominal value. Any device connected to the power grid (or even just within physical proximity of power wiring) can
+reliably and accurately measure grid frequency at low hardware overhead. By intentionally modifying grid frequency, we
+can create a very low-bandwidth broadcast communication channel. Grid frequency modulation has only ever been proposed
+as a communication channel at very small scales in microgrids before\cite{urtasun01} and to our knowledge has not yet
+been considered for large-scale application.
+
+Advantages of using grid frequency for communication are low receiver hardware complexity as well as the fact that a
+single transmitter can cover an entire synchronous area. Though the transmitter has to be very large and powerful the
+setup of a single large transmitter faces lower bureaucratic hurdles than integration of hundreds of smaller ones into
+hundreds of local systems that each have autonomous governance.
+
+\subsubsection{The load dependency of grid frequency}
+
+Despite the awesome complexity of large power grids the physics underlying their response to changes in load and
+generation is surprisingly simple. Individual machines (loads and generators) can be approximated by a small number of
+differential equations and the entire grid can be modelled by aggregating these approximations into a large system of
+nonlinear differential equations. Evaluating these systems it has been found that in large power grids small signal
+steady state changes in generation/consumption power balance cause an approximately linear change in
+frequency\cite{kundur01,crastan03,entsoe02,entsoe04}. \emph{Small signal} here describes changes in power balance that
+are small compared to overall grid power. \emph{Steady state} describes changes over a time frame of multiple waveform
+cycles as opposed to transient events that only last a few milliseconds.
+
+This approximately linear relationship allows the specification of a coefficient with unit \si{\watt\per\hertz} linking
+power differential $\Delta P$ and frequency differential $\Delta f$. In this thesis we are using the European power
+grid as our model system. We are using data provided by ENTSO-E (formerly UCTE), the governing association of European
+transmission system operators. In our calculations we use data for the continental European synchronous area, the
+largest synchronous area. $\frac{\Delta P}{\Delta f}$, called \emph{Overall Network Power Frequency Characteristic} by
+ENTSO-E is around \SI{25}{\giga\watt\per\hertz}.
+
+We can derive general design parameters for any system utilizing grid frequency as a communication channel from the
+policies of ENTSO-E\cite{entsoe02,entsoe03}. Any such system should stay below a modulation amplitude of
+\SI{100}{\milli\hertz} which is the threshold defined in the ENTSO-E incidents classification scale for a Scale 0-1
+(from ``Anomaly'' to ``Noteworthy Incident'' scale) frequency degradation incident\cite{entsoe02} in the continental
+Europe synchronous area.
+
+\subsubsection{Control systems coupled to grid frequency}
+
+The ENTSO-E Operations Handbook Policy 1 chapter\cite{entsoe02} defines the activation threshold of primary control to
+be \SI{20}{\milli\hertz}. Ideally, a modulation system would stay well below this threshold to avoid fighting the
+primary control reserve. Modulation line rate should likely be on the order of a few hundred Millibaud. Modulation at
+these rates would outpace primary control action which is specified by ENTSO-E as acting within ``a few
+seconds'' to \SI{15}{\second}.
+
+Keeping modulation amplitude below this threshold would help to avoid spuriously triggering these control functions.
+The effective \emph{Network Power Frequency Characteristic} of primary control in the European grid is reported by
+ENTSO-E at around \SI{20}{\giga\watt\per\hertz}. This works out to an upper bound on modulation power of
+\SI{20}{\mega\watt\per\milli\hertz}.
+
+\subsubsection{An outline of practical transmitter implementation}
+
+In its most basic form a transmitter for grid frequency modulation would be a very large controllable load connected to
+the power grid at a suitable vantage point. A spool of wire submerged in a body of cooling liquid such as a small lake
+along with a thyristor rectifier bank would likely suffice to perform this function during occasional cybersecurity
+incidents. We can however decrease hardware and maintenance investment even further compared to this rather
+uncultivated solution by repurposing regular large industrial loads as transmitters in an emergency situation. For some
+preliminary exploration we went through a list of energy-intensive industries in Europe\cite{ec01}. The most
+electricity-intensive industries in this list are primary aluminum and steel production. In primary production raw ore
+is converted into raw metal for further refinement such as casting, rolling or extrusion. In steelmaking iron is
+melted in an electric arc furnace. In aluminum smelting aluminum is electrolytically extracted from alumina. Both
+processes involve large amounts of electricity with electricity making up \SI{40}{\percent} of production costs. Given
+these circumstances a steel mill or aluminum smelter would be good candidates as transmitters in a grid frequency
+modulation system.
+
+In aluminum smelting high-voltage mains is transformed, rectified and fed into about 100 series-connected electrolytic
+cells forming a \emph{potline}. Inside these pots alumina is dissolved in molten cryolite electrolyte at about
+\SI{1000}{\degreeCelsius} and electrolysis is performed using a current of tens or hundreds of Kiloampère. The resulting
+pure aluminum settles at the bottom of the cell and is tapped off for further processing.
+
+Like steelworks, aluminum smelters are operated night and day without interruption. Aside from metallurgical issues the
+large thermal mass and enormous heating power requirements do not permit power cycling. Due to the high costs of
+production inefficiencies or interruptions the behavior of aluminum smelters under power outages is a
+well-characterized phenomenon in the industry. The recent move away from nuclear power and towards renewable energy has
+led to an increase in fluctuations of electricity price throughout the day. These electricity price fluctuations have
+provided enough economic incentive to aluminum smelters to develop techniques to modulate smelter power consumption
+without affecting cell lifetime or product quality\cite{duessel01,eisma01}. Power outages of tens of minutes up to two
+hours reportedly do not cause problems in aluminum potlines and are in fact part of routine operation for purposes such
+as electrode changes\cite{eisma01,oye01}.
+
+The power supply system of an aluminum plant is managed through a highly-integrated control system as keeping all cells
+of a potline under optimal operating conditions is challenging. Modern power supply systems employ large banks of diodes
+or SCRs\footnote{SCRs, also called thyristors, are electronic devices that are often used in high-power switching
+applications. They are normally-off devices that act like diodes when a current is fed into their control terminal.} to
+rectify low-voltage AC to DC to be fed into the potline\cite{ayoub01}. The potline voltage can be controlled almost
+continuously through a combination of a tap changer and a transductor. The individual cell voltages can be controlled by
+changing the anode to cathode distance (ACD) by physically lowering or raising the anode. The potline power supply is
+connected to the high voltage input and to the potline through isolators and breakers.
+
+In an aluminum smelter most of the power is sunk into resistive losses and the electrolysis process. As such an
+aluminum smelter does not have any significant electromechanical inertia compared to the large rotating machines used
+in other industries. Depending on the capabilities of the rectifier controls high slew rates are possible, permitting
+modulation at high\footnote{Aluminum smelter rectifiers are \emph{pulse rectifiers}. This means instead of simply
+rectifying the incoming three-phase voltage they use a special configuration of transformer secondaries and in some
+cases additional coils to produce a large number of equally spaced phases (e.g.\ six) from a standard three-phase input.
+Where a direct-connected three-phase rectifier would draw current in six pulses per mains voltage cycle a pulse
+rectifier draws current in more, smaller pulses to increase power factor. For example a 12-pulse rectifier will draw
+current in 12 pulses per cycle. In the best case an SCR pulse rectifier switched at zero crossing should allow
+\SIrange{0}{100}{\percent} load changes from one rectifier pulse to the next, i.e.\ within a fraction of a single cycle.}
+data rates.
+
+\subsubsection{Avoiding dangerous modes}
+
+Modern power systems are complex electromechanical systems. Each component is controlled by several carefully tuned
+feedback loops to ensure voltage, load and frequency regulation. Multiple components are coupled through transmission
+lines that themselves exhibit complex dynamic behavior. The overall system is generally stable, but may exhibit
+instabilities to particular small-signal stimuli\cite{kundur01,crastan03}. These instabilities, called \emph{modes},
+occur when due to mis-tuning of parameters or physical constraints the overall system exhibits oscillation at a
+particular frequency. \cite{kundur01} separates these modes into four categories:
+
+\begin{description}
+ \item[Local modes] where a single power station oscillates in some parameter,
+ \item[Interarea modes] where subsections of the overall grid oscillate with respect to each other due to weak
+ coupling between them,
+ \item[Control modes] caused by imperfectly tuned control systems and
+ \item[Torsional modes] that originate from electromechanical oscillations in the generator itself.
+\end{description}
+
+The oscillation frequencies associated with each of these modes are usually between a few tens of Millihertz and a few
+Hertz\cite{grebe01,entsoe01,crastan03}. It is hard to predict the particular modes of a power system at the scale of the
+central European interconnected system. Theoretical analysis and simulation may give rough indications but cannot yield
+conclusive results. Due to the obvious danger as well as high economic impact due to inefficiencies experimental
+measurements are infeasible. Modes are highly dependent on the power grid's structure and will change with changes in
+the power grid over time. For all of these reasons, a grid frequency modulation system must be designed very
+conservatively without relying on the absence (or presence) of modes at particular frequencies. A concrete design
+guideline that we can derive from this situation is that the frequency spectrum of any grid frequency modulation system
+should not exhibit large peaks and should avoid a concentration of spectral energy in small frequency bands.
+
+\subsubsection{Overall system parameters}
+
+In conclusion we end up with the following tunable parameters for a grid frequency modulation system based on a large
+controllable load:
+
+\begin{description}
+ \item[Modulation amplitude.] Amplitude is proportionally related to modulation power. In a practical setup we might
+ realize a modulation power up to a few hundred \si{\mega\watt} which would yield a few tens of \si{\milli\hertz}
+ of frequency amplitude.
+ \item[Modulation preemphasis and slew-rate control.] Preemphasis might be necessary to ensure an adequate
+ Signal-to-Noise ratio (SNR) at the receiver. Slew-rate control and other shaping measures might be necessary to
+ reduce the impact of these sudden load changes on the transmitter's primary function (say, aluminum smelting)
+ and to prevent disturbances to other grid components.
+ \item[Modulation frequency.] For a practical implementation a careful study would be necessary to determine the
+ optimal frequency band for operation. On one hand we need to prevent disturbances to the grid such as the
+ excitation of local or inter-area modes. On the other hand we need to optimize Signal-to-Noise ratio (SNR)
+ and data rate to achieve optimal latency between transmission start and reset completion and to reduce the
+ overall burden on both transmitter and grid.
+ \item[Further modulation parameters.] The modulation itself has numerous parameters that are discussed in Section
+ \ref{mod_params} below.
+\end{description}
+
+\section{From grid frequency to a reliable communication channel}
+Based on the physical properties outlined above we will provide the theoretical groundwork for a practical communication
+system based on grid frequency modulation.
+
+\subsection{Channel properties}
+In this section we will explore how we can construct a reliable communication channel from the analog primitive we
+have outlined in the previous section. Our load control approach to grid frequency modulation leads to a channel with the
+following properties.
+
+\begin{description}
+ \item[Slow-changing.] Accurate grid frequency measurements take several periods of the mains sine wave. Faster
+ sampling rates can be achieved with more complex specialized synchrophasor estimation algorithms but this will
+ result in a trade-off between sampling rate and accuracy\cite{belega01}.
+ \item[Analog.] Grid frequency is an analog signal.
+ \item[Noisy.] While stable over long periods of time thanks to power stations' Load-Frequency Control
+ systems\cite{entsoe04} there are considerable random short-term variations. Our modulation amplitude is limited
+ by technical and economic constraints so we have to find a system that will work at poor SNRs.
+ \item[Polarized.] Grid frequency measurements have an inherent sense of polarity that we can use in our modulation
+ scheme.
+\end{description}
+
+\subsection{Modulation and its parameters}
+\label{mod_params}
+
+In this section we will analyze what makes for a good set of parameters for a modulation scheme fitting grid frequency
+modulation.
+
+As described before the grid's oscillatory modes mean that we should avoid any modulation technique that would
+concentrate energy in a small bandwidth. Taking this principle to its extreme provides us with a useful pointer towards
+techniques that might work well: Spread-spectrum techniques. By employing spread-spectrum modulation we can produce
+close to ideal frequency-domain behavior. Modulation energy is spread almost flatly across the modulation
+bandwidth\cite{goiser01}. At the same time we achieve modulation gain which increases system sensitivity. This
+modulation gain potentially allows us to use a weaker stimulus allowing for a further reduction of the probability of
+disturbance to the overall system. Spread-spectrum techniques also inherently allow us to trade off receiver sensitivity
+for data rate. This tunability is a useful parameter in the overall system design.
+
+Spread spectrum covers a whole family of techniques that are comprehensively explained in \cite{goiser01}.
+\cite{goiser01} divides spread spectrum techniques into the coarse categories of \emph{Direct Sequence Spread Spectrum},
+\emph{Frequency Hopping Spread Spectrum} and \emph{Time Hopping Spread Spectrum}.
+
+In \cite{goiser01} a BPSK or similar modulation is assumed underlying the spread-spectrum technique. Our grid frequency
+modulation channel effectively behaves more like a DC-coupled wire than a traditional radio channel: Any change in
+excitation will cause a proportional change in the receiver's measurement. Using our FFT-based measurement methodology
+we get a real-valued signed quantity. In this way grid frequency modulation is similar to a channel using coherent
+modulation. We can utilize both signal strength and polarity in our modulation.
+
+For our purposes we can discount both Time and Frequency Hopping Spread Spectrum techniques. Time hopping helps to
+reduce interference between multiple transmitters but does not help with SNR any more than Direct Sequence does since
+all it does is allow other transmitters to transmit. Our system is strictly limited to a single transmitter so we do
+not gain anything through Time Hopping.
+
+Frequency Hopping Spread Spectrum techniques require a carrier. Grid frequency modulation itself is very limited in
+peak frequency deviation $\Delta f$. Frequency hopping could only be implemented as a second modulation on top of GFM,
+but this would not yield any benefits while increasing system complexity and decreasing data bandwidth.
+
+Direct Sequence Spread Spectrum is the only remaining approach for our application. Direct Sequence Spread Spectrum
+works by directly modulating a long pseudo-random bit sequence onto the channel. The receiver must know the same
+pseudo-random bit sequence and continuously calculates the correlation between the received signal and the pseudo-random
+template sequence mapped from binary $\{0, 1\}$ to bipolar $\{+1, -1\}$. The pseudo-random sequence has an approximately equal
+number of $0$ and $1$ bits. The positive contribution of the $+1$ terms of the correlation template approximately cancels
+out with the negative contribution of the $-1$ terms when multiplied with an uncorrelated signal such as white Gaussian noise.
+
+By using a family of pseudo-random sequences with low cross-correlation channel capacity can be increased. Either the
+transmitter can encode data in the choice of sequence or multiple transmitters can use the same channel at once. The
+longer the pseudo-random sequence, the lower its cross-correlation with noise or other pseudo-random sequences of the
+same length. Choosing a long sequence we increase modulation gain while decreasing bandwidth. For any given application
+the sweet spot will be the shortest sequence that is long enough to yield sufficient SNR for subsequent processing
+layers such as channel coding.
+
+Gold codes are a popular family of codes used in many DSSS systems. A set of Gold codes has small cross-correlations. For some
+value $n$ a set of Gold codes contains $2^n + 1$ sequences of length $2^n - 1$. Gold codes are generated from two
+different maximum length sequences generated by linear feedback shift registers (LFSRs). For any bit count $n$ there are
+certain empirically determined preferred pairs of LFSRs that produce Gold codes with especially good cross-correlation.
+The $2^n + 1$ Gold codes are defined as the XOR sum of both LFSR sequences, with one of them cyclically shifted by $0$ to $2^n-2$ bits, as well as
+the two individual LFSR sequences. Given LFSR sequences \texttt{a} and \texttt{b} in numpy notation this is
+\mintinline{python}{[a, b] + [ a ^ np.roll(b, shift) for shift in range(len(b)) ]}.
+
+In DSSS modulation the individual bits of the DSSS sequence are called \emph{chips}. Chip duration determines modulation
+bandwidth\cite{goiser01}. In our system we are directly modulating DSSS chips on mains frequency without an underlying
+modulation such as BPSK as it is commonly used in DSSS systems.
+
+\subsection{Error-correcting codes}
+
+To reduce reception error rate we have to layer channel coding on top of the DSSS modulation. The messages we expect to
+transmit are at least a few tens of bits long. We are highly constrained in SNR due to limited transmission power and
+with lower SNR comes higher BER (Bit Error Rate). At a fixed BER, packet error rate grows exponentially with
+transmission length so for our relatively long transmissions we would realistically get unacceptable error rates.
+
+Error correcting codes are a very broad field with many options for specialization. Since we are implementing only an
+advanced prototype in this thesis we chose to spend only limited resources on optimization and settled on a basic
+Reed-Solomon code. We have no doubt that applying a more state-of-the-art code we could gain further improvements in
+code overhead and decoding speed among others\cite{mackay01}. Since message length in our system limits system response
+time but we do not have a fixed target we can tolerate some degree of overhead. Decoding speed is of very low concern
+to us because our data rate is extremely low. We derived our implementation by adapting and optimizing an existing open
+source decoder that we validated on an open source encoder implementation. We generate test signals using a Python tool
+on the host.
+
+\subsection{Cryptographic security}
+\label{sec-crypto}
+Above the communication base layer elaborated in the previous section we have to layer a cryptographic protocol to
+ensure system security. We want to avoid a case where a third party could interfere with our system or even subvert this
+safety system itself for an attack. From a protocol security perspective the system we are looking for can informally
+be modelled as consisting of three parties: the trusted \emph{transmitter}, one of a large number of untrusted
+\emph{receivers}, and an \emph{attacker}. These three play according to the following rules:
+
+\begin{description}
+ \item[Access.] Both transmitter and attacker can transmit any bit sequence.
+ \item[Indistinguishability.] The receiver receives any transmission by either but cannot distinguish between them.
+    \item[Kerckhoffs's principle.] Since the protocol design is public and anyone can get access to an electricity meter
+ the attacker knows anything any receiver might know\cite{kerckhoff01,kerckhoff02}.
+ \item[Priority.] The transmitter is stronger than an attacker and will ``win'' during simultaneous transmission.
+ \item[Seeding.] Both transmitter and receiver can be seeded out-of-band with some information on each other such as
+ public key fingerprints.
+\end{description}
+
+We are not considering situations where an attacker attempts to jam an ongoing transmission. In practice there are
+several avenues to prevent such attempts. Compromised large loads that are being abused by the attacker can be manually
+disconnected by the utility. Error-correcting codes can be used to provide resiliency against small-scale disturbances.
+Finally, the transmitter can be designed to have high enough power to be able to override any likely attacker.
+
+With the above properties in mind our goal is to find a cryptographic primitive that has the following properties:
+\begin{description}
+ \item[Authentication.] The transmitter can produce a message bit sequence that a certain subset of receivers can
+ identify as being generated by the transmitter. On reception of this sequence, all addressed receivers perform a
+ safety reset.
+ \item[Unforgeability.] The attacker cannot forge a message, i.e.\ find a bit sequence other than one of the
+ transmitter's previous messages that a receiver would accept. This implies that the attacker also cannot create
+ a new distinct message from a previously transmitted message.
+ \item[Brevity.] The message should be short. Our communication channel is outrageously slow compared to anything
+ else used in modern telecommunications and every bit counts.
+\end{description}
+
+On a protocol level we also have to ensure \emph{idempotence}. Our system should have an at-most-once semantic. This
+means for a given message each receiver either performs exactly one safety reset or none at all, even if the message is
+re-transmitted by either the transmitter or an attacker. We cannot achieve the ideal exactly-once semantic with pure
+protocol gymnastics since we are using a unidirectional lossy communication primitive. A receiver might be offline
+(e.g.\ due to a local power outage) and then would not hear the transmission even if our broadcast primitive was
+reliable. Since there is no back channel, the transmitter has no way of telling when that happens. The practical impact
+of this can be mitigated by the transmitter repeating the message a number of times.
+
+It follows from the unforgeability requirement that we can trivially reach idempotence at the protocol level by keeping
+a database of all previous messages and only accepting new messages. By considering this in our cryptographic design we
+can reduce the storage overhead of this ``database''.
+
+Along with the indistinguishability property the access requirement implies that we need a cryptographic
+signature\cite{lamport01}. However, we have relaxed constraints on this signature compared to standard cryptographic
+practice\cite{anderson04}. While cryptographic signatures need to work over arbitrary inputs, all we want to ``sign''
+here is the instruction to perform a safety reset. This is the only message we might ever want to transmit so our
+message space has only one element. The information content of our message thus is 0 bit! All the information we want to
+transmit is already encoded \emph{in the fact that we are transmitting} and we do not require a further payload to be
+transmitted: We can omit the entirety of the message and just transmit whatever ``signature'' we
+produce\cite{haller01,rfc1760}. This is useful to conserve transmission bits so our transmission does not take an
+exceedingly long time over our extremely slow communication channel.
+
+We can modify this construction to allow for a small number of bits of information content in our message (say two or
+three instead of zero) at no transmission overhead by transmitting the cryptographic signature as usual but simply
+omitting the message. The message contains only a few bits of information and we are dealing with minutes of
+transmission time so the receiver can reconstruct the message through brute-force. Though this trade-off between
+computation and data transmission might seem inelegant it does work for our extremely slow link for up to a few bits of
+information.
+
+There is an important limitation in the rules of our setup above: The attacker can always record the reset bit sequence
+the transmitter transmits and replay that same sequence later. Even without cryptography we can trivially prevent an
+attacker from violating the at-most-once criterion. If every receiver memorizes all bit sequences that have been
+transmitted so far it can detect replays. With this mitigation by replaying an older authentic transmission an attacker
+can cause receivers that were offline during the original transmission to reset at a later point. Considering our goal
+is to reset them in the first place this should not pose a threat to the system's safety or security.
+
+A possible scenario would be that an attacker first causes enough havoc for authorities to trigger a safety reset. The
+attacker would record the trigger transmission. We can assume most meters were reset during the attack. Due to this the
+attacker cannot cause a significant number of additional resets immediately afterwards. However, the attacker could
+wait several years for a number of new meters to be installed that might not yet have updated firmware that includes the
+last transmission. This means the attacker could cause them to reset by replaying the original sequence.
+
+A possible mitigation for this risk would be to introduce one bit of information into the trigger message that is
+ignored by the replay protection mechanism. This \emph{enable} bit would be $1$ for the actual reset trigger message.
+After the attack the transmitter would then perform scheduled transmissions of a ``disarm'' message that has this bit
+set to $0$. This message informs all new meters and meters that were offline during the original transmission of the
+original transmission for replay protection without actually performing any further resets.
+
+We could use any of several traditional asymmetric cryptographic primitives to produce these signatures. The
+comparatively high computational effort required for signature verification would not be an issue. Transmissions take
+several minutes anyway and we can afford to spend some tens of seconds even in signature verification. Transmission
+length and by proxy system latency would be determined by the length of the signature. For RSA signature length is the
+modulus length (i.e.\ larger than \SI{1000}{bit} for very basic contemporary security). For elliptic curve-based systems
+curve length is approximately twice the security level and signature size is twice the curve length because two curve
+points need to be encoded\cite{anderson02}. For contemporary security this results in more than 300 bit transmission
+length. We can exploit our unique setting's low message entropy to improve on this by basing our scheme on a
+cryptographic hash function used as a one-way pseudo-random function (PRF). Hash-based signature schemes date back to
+the very beginnings of cryptographic signatures\cite{anderson04,diffie01,lamport02}. Today, in general applications
+schemes based on asymmetric cryptography are preferred but hash-based signature systems have their applications in
+certain use cases. One example of such a scheme is the TESLA scheme\cite{perrig01} that is the basis for navigation
+message authentication in the European Galileo global navigation satellite system. Here, a system based purely on
+asymmetric primitives would result in too much computation and communication overhead\cite{ec05}. In the following
+sections we will introduce the foundations of hash-based signatures before deriving our authentication scheme.
+
+\subsubsection{Lamport signatures}
+
+In 1979, Lamport in \cite{lamport02} introduced a signature scheme that is based only on a one-way function such as a
+cryptographic hash function. The basic observation is that by choosing a random secret input to a one-way function and
+publishing the output, one can later prove knowledge of the input simply by publishing it. In the following paragraphs
+we will describe a construction of a one-time signature scheme based on this observation. The scheme we describe is the
+one usually called a ``Lamport Signature'' in modern literature but is slightly different from the variant described in
+the 1979 paper. For our purposes we can consider both to be equivalent.
+
+\paragraph{Setup.} In a Lamport signature, for an $n$-bit hash function $H$ the signer generates a private key $s =
+\left(s_{b, i} \mid b\in\left\{0, 1\right\}, 0\le i<n\right)$ of $2n$ random strings of length $n$. The signer publishes a
+public key $p = \left(p_{b, i} = H\left(s_{b, i}\right) \mid b\in\left\{0, 1\right\}, 0\le i<n\right)$ that is simply the
+list of hashes of each of the random strings that make up the private key.
+
+\paragraph{Signing.} To sign a message $m$, the signer publishes the signature $\sigma = \left(\sigma_i = s_{H(m)_i,
+i}\right)$ where $H(m)_i$ is the $i$-th bit of $H$ applied to $m$. That is, for the $i$-th bit of the message's hash
+$H(m)$ the signer publishes either of $s_{0, i}$ or $s_{1, i}$ depending on the hash bit's value, keeping the other
+entry of the private key $s$ secret.
+
+\paragraph{Verification.} The verifier can compute $H(m)$ themselves and check that each signature entry $\sigma_i =
+s_{H(m)_i, i}$ correctly evaluates to the corresponding public key entry $p_{H(m)_i, i} = H\left(\sigma_i\right)$ under $H$.
+
+The above scheme is a one-time signature scheme only. After one signature has been published for a given key, the
+corresponding key must not be reüsed for other signatures. This is intuitively clear as we are effectively publishing
+part of the private key as the signature, and if we were to publish a signature for another message an attacker could
+derive additional signatures by ``mixing'' the two published signatures.
+
+\subsubsection{Winternitz signatures}
+
+An improvement to basic Lamport signatures as described above are Winternitz signatures as detailed in
+\cite{merkle01,dods01}. Winternitz signatures reduce public key length as well as signature length for hash length $n$
+from $2n$ to $\mathcal O \left(n/t\right)$ for some choice of parameter $t$ (usually a small number such as 4).
+
+\paragraph{Setup.} The signer generates a private key $s = \left(s_i\right)$ consisting of $\ceil{\frac{n}{t}}$ random
+bit strings. The signer publishes a public key $p = \left(H^{2^t}\left(s_i\right)\right)$ where each element
+$H^{2^t}\left(s_i\right)$ is the $2^t$-fold recursive application of $H$ to $s_i$.
+
+\paragraph{Signing.} The signer splits $m$ padded to a multiple of $t$ bits into $\ceil{\frac{n}{t}}$ chunks $m_i$ of
+$t$ bit each. The signer publishes the signature $\sigma = \left( \sigma_i = H^{m_i}\left(s_i\right) \right)$.
+
+\paragraph{Verification.} The verifier can calculate for each $\sigma_i = H^{m_i}\left(s_i\right)$ that $H^{2^t -
+m_i}\left(\sigma_i\right) = H^{2^t - m_i}\left(H^{m_i}\left(s_i\right)\right) = H^{2^t - m_i + m_i} \left(s_i\right) =
+p_i$.
+
+To prevent an attacker from forging additional signatures from one signature by calculating $\sigma_i' =
+H\left(\sigma_i\right)$ matching $m_i' = m_i + 1$, this scheme is usually paired with a simple checksum as described in
+\cite{merkle01}.
+
+\subsubsection{Using hash-based signatures for trigger authentication}
+
+Applying these concepts the most basic trigger authentication scheme possible would be to simply generate a random
+secret key bit string $s$ and publish $p = H(s)$ for some hash function $H$. To activate the trigger, $\sigma = s$ is
+published and receivers verify that $H(\sigma) = p = H(s)$. This simplistic scheme has one main disadvantage: It is a
+fundamentally one-time construction. To prevent an attacker from re-triggering a receiver a second time by replaying a
+valid trigger $\sigma$ all receivers have to blacklist any ``used'' $\sigma$. Alas, this means we can only ever trigger
+a receiver \emph{once}. The good part is that any receiver that missed this trigger can still be triggered later, but
+the bad part is that once $s$ is burned we are out of options. The trivial solution to this would be to simply provision
+each receiver with a whole list of public keys in advance. This however takes $n$ times the amount of space for $n$-fold
+retriggerability and for each one we have to memorize separately whether it has been used up. Luckily we can easily
+derive a scheme that yields $n$-fold retriggerability and naturally memorizes replay state while using no more space
+than the original scheme by taking some inspiration from Winternitz signatures.
+
+In this improved scheme the secret key $s$ is still a random bit string. The public key is $p = H^n(s)$ for $n$-times
+retriggerability. The $i$-th time the trigger is activated, $\sigma_i = H^{n-i}(s)$ is published, and every receiver
+can verify that $\sigma_{i-1} = H\left(\sigma_i\right)$ with $\sigma_0 = p$. In case a receiver missed one or more
+previous triggers it continues computing $H\left(H\left(\sigma_i\right)\right)$ and
+$H\left(H\left(H\left(\sigma_i\right)\right)\right)$ and so on until either reaching the $n$-th recursion
+level--indicating an invalid signature--or finding a $k \le n$ such that $H^k\left(\sigma_i\right) = \sigma_j$ with
+$\sigma_j$ being the last signature this receiver recorded or $p$ in case there is none.
+
+This scheme provides replay protection since the receiver memorizes the last signature they acted on. Public key length
+is equal to the length of the hash function $H$ used. Even for our embedded systems use case $n$ can realistically be up
+to $\mathcal O\left(10^3\right)$, which is enough for our purposes. This use of a hash chain for event authentication is
+identical to the one in the S/KEY one-time password system\cite{anderson04,haller01,rfc1760}.
+% 1990ies crypto yeah!
+
+The ``disarm'' message we discussed above for replay protection can be integrated into this scheme by encoding the
+``enable'' bit into the least significant bit of $n$ in our $H^n$ construction. In the chain of valid signatures every
+second one would be a disarm signature: Reset and disarm signatures would alternate in this scheme. By skipping a disarm
+signature two resets can still be triggered directly after one another.
+
+In practice it may be useful to have some control over which meters reset. An attack exploiting a particular network
+protocol implementation flaw might only affect one series of meters made by one manufacturer. Resetting \emph{all}
+meters may be too much in this case. A simple solution for this is to define addressable subsets of meters. ``All
+meters'' along with ``meters made by manufacturer $x$'' and ``meters of model $y$'' are good choices for such scopes. On
+the cryptographic level the protocol state is simply duplicated for each scope. This incurs memory and computation
+overhead linear in the number of scopes but device memory requirements are small at a few bytes only and computation is
+of no concern due to the very slow channel so this simple solution is adequate. The transmitter has to either store
+copies of all scopes' keys or derive these keys from a root key using the scope's identifier. Keys are small and the
+transmitter would be using a regular server or hardware security module for key management so either is easily feasible.
+
+A diagram of the key structure in this key management scheme is shown in Figure \ref{fig:sig_key_chain}. The
+transmitter key management is shown in Figure \ref{fig:tx_scope_key_illu}. This scheme is simplistic but suffices for
+our prototype in Section \ref{sec-prototype} and may even be useful in a practical implementation. During
+standardization of a safety reset system the key management system would most likely have to be customized to the
+particular application's requirements. Developing a universal solution is outside the scope of this work.
+
+\begin{figure}
+ \centering
+ \begin{minipage}[c]{0.5\textwidth}
+ \includegraphics{resources/signature_key_chain}
+ \end{minipage}
+ \begin{minipage}[c]{0.45\textwidth}
+ \caption{
+ The hash chain between secret transmitter key and public device key. Each step represents one invocation of the
+ hash function. To generate a new chain a random transmitter key is generated, then hashed $n$ times to
+ generate the corresponding device key. A new trigger message can be generated by generating the key at depth
+ $m-1$ where $m$ is the height of the last used trigger, or $n$ initially. Reset and disarm messages alternate
+ along the chain. Depending on which is needed either one may be skipped.
+ }
+ \label{fig:sig_key_chain}
+ \end{minipage}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{resources/transmitter_scope_key_illustration}
+ \caption{
+ An illustration of a key management system using a common master key. First, the transmitter derives one secret
+ key for each addressable group from the master key. Then public device keys are generated like in Figure
+ \ref{fig:sig_key_chain}. Finally for each device the manufacturer picks the group public keys matching the
+ device. In this example one device is a series A meter made by manufacturer B so it gets provisioned with the
+ keys for the ``all devices'', ``manufacturer B'' and ``series A'' groups. The other device is also made by
+ manufacturer B but is a series C device so it gets provisioned with the ``all devices'', ``manufacturer B'' and
+ ``series C'' device keys. In this example the transmitter stores (or is able to derive) all six shown
+ group keys, but each device only needs to store the three applying to it--one for each of the three scopes ``all
+ devices'', ``manufacturer'' and ``series''.
+ }
+ \label{fig:tx_scope_key_illu}
+\end{figure}
+
+\chapter{Practical implementation}
+
+To validate the practical feasibility of the theoretical concepts we laid out in the previous chapter we decided to
+build a prototype of a safety reset controller. In this chapter we describe the reasoning behind the components of this
+prototype and the engineering that went into its firmware. The prototype consists of a smart meter whose application
+microcontroller is reset by a microcontroller on an external circuit board. We lay out how we extensively
+tested all parts of our firmware implementation. We conclude with results of a practical end-to-end experiment
+exercising every part of our prototype.
+
+\section{Data collection for channel validation}
+
+To design a solid system we needed to parametrize mains frequency variations under normal conditions. To set modulation
+amplitude as well as parameters of our modulation scheme we need a frequency spectrum of mains frequency variations
+(that is $\mathcal F\left(f(V(t))\right)$: Taking mains frequency $f(x)$ as a variable, the frequency spectrum of that
+variable, as opposed to the frequency spectrum of mains voltage $V(t)$ itself).
+
+\subsection{Grid frequency estimation}
+\label{frequency_estimation}
+
+In commercial power systems Phasor Measurement Units (PMUs, also called \emph{synchrophasors}) are used to precisely
+measure parameters of the mains voltage waveform, one of which is grid frequency. PMUs are used as part of SCADA systems
+controlling transmission networks to characterize the operational state of the network.
+
+From a superficial viewpoint measuring grid frequency might seem like a simple problem. Take the mains voltage waveform,
+measure time between two rising-edge (or falling-edge) zero-crossings and take the inverse $f = t^{-1}$. In practice,
+phasor measurement units are significantly more complex than this. This discrepancy is due to the combination of both
+high precision and quick response that is demanded from these units. High precision is necessary since variations of
+mains frequency under normal operating conditions are quite small--in the range of \SIrange{5}{10}{\milli\hertz} over
+short intervals of time. Relative to the nominal \SI{50}{\hertz} this is a deviation of less than \SI{100}{ppm}.
+Relative to the corresponding period of \SI{20}{\milli\second} this means a time deviation of about \SI{2}{\micro\second}
+from cycle to cycle. From this it is already obvious why a simplistic measurement cannot yield the required precision
+for manageable averaging times: We would need either an ADC sampling rate in the order of megasamples per second or for a
+reconstruction through interpolated readings an impractically high ADC resolution.
+
+Detail on the inner workings of commercial phasor measurement units is scarce but given their essential role to SCADA
+systems there is a large amount of academic research on such algorithms\cite{narduzzi01,derviskadic01,belega01}. A
+popular approach to these systems is to perform a Short-Time Fourier Transform (STFT) on ADC data sampled at high
+sampling rate (e.g.\ \SI{10}{\kilo\hertz}) and then perform analysis on the frequency-domain data to precisely locate the
+peak at \SI{50}{\hertz}. A key observation here is that FFT bin size is going to be much larger than required frequency
+resolution. This fundamental limitation follows from the Nyquist criterion\cite{shannon01}
+and if we had to process an \emph{arbitrary} signal this would severely limit our practical measurement accuracy%
+\footnote{
+ Some software packages providing FFT or STFT primitives such as scipy\cite{virtanen01} allow the user to
+ super-sample FFT output by specifying an FFT width larger than input data length, padding the input data with zeros
+ on both sides. Note that in line with the Nyquist theorem this \emph{does not} actually provide finer output
+ resolution but instead just amounts to an interpolation between output bins. Depending on the downstream analysis
+ algorithm it may still be sensible to use this property of the DFT for interpolation, but in general it will be
+ computationally expensive compared to other interpolation methods and in any case it will not yield any better
+ frequency resolution aside from a potential numerical advantage\cite{gasior02}.
+}.
+For this reason all approaches to grid frequency estimation are based on a model of the voltage waveform. Nominally
+this waveform is a perfect sine at $f=\SI{50}{\hertz}$. In practice it is a sine at $f\approx\SI{50}{\hertz}$
+superimposed with some aperiodic noise (e.g.\ irregular spikes from inductive loads being energized) as well as harmonic
+distortion that is caused by topologically nearby devices with power factor $\cos \theta \neq 1.0$. Under a continuous
+Fourier transform over a long period the frequency spectrum of a signal distorted like this will be a low noise floor
+depending mainly on aperiodic noise on which a comb of harmonics as well as some sub-harmonics of $f \approx
+f_\text{nom} = \SI{50}{\hertz}$ is riding. The main peak at $f \approx f_\text{nom}$ will be very strong with the
+harmonics being approximately an order of magnitude weaker in energy and the noise floor being at least another order of
+magnitude weaker. See Figure \ref{mains_voltage_spectrum} for a measured spectrum. This domain knowledge about the
+expected frequency spectrum of the signal can be employed in a number of interpolation techniques to reconstruct the
+precise frequency of the spectrum's main component despite distortions and the comparatively coarse STFT resolution.
+
+Published grid frequency estimation algorithms such as \cite{narduzzi01,derviskadic01} are rather sophisticated and use
+a combination of techniques to reduce numerical errors in FFT calculation and peak fitting. Given that we do not need
+reference standard-grade accuracy for our application we chose to start with a very basic algorithm instead. We chose to
+use a general approach to estimate the precise fundamental frequency of an arbitrary signal that was published by
+experimental physicists Gasior and Gonzalez at CERN\cite{gasior01}. This approach assumes a general sinusoidal signal
+superimposed with harmonics and broadband noise. Applicable to a wide spectrum of practical signal analysis tasks it is
+a reasonable first-degree approximation of the much more sophisticated estimation algorithms developed specifically for
+power systems. Some algorithms use components such as Kalman filters\cite{narduzzi01} that require a physical model.
+As a general algorithm \cite{gasior01} does not require this kind of application-specific tuning, eliminating one source
+of error.
+
+The Gasior and Gonzalez algorithm\cite{gasior01} passes the windowed input signal through a DFT, then interpolates the
+signal's fundamental frequency by fitting a wavelet such as a Gaussian to the largest peak in the DFT results. The bias
+parameter of this curve fit is an accurate estimation of the signal's fundamental frequency. This algorithm is similar
+to the simpler interpolated DFT algorithm used as a reference in much of the synchrophasor estimation
+literature\cite{borkowski01}. The three-term variant of the maximum side lobe decay window often used there is a
+Blackman window with parameter $\alpha = \frac{1}{4}$. Analysis has shown\cite{belega01} that the interpolated DFT
+algorithm is worse than algorithms involving more complex models under some conditions but that there is \emph{no free
+lunch} meaning that more complex algorithms perform worse when the input signal deviates from their models.
+
+\subsection{Frequency sensor hardware design}
+
+\label{sec-fsensor}
+Our safety reset controller will have to measure mains frequency to later demodulate a reset signal transmitted through
+it. Since we have decided to do our own frequency measurement system here we can reüse this frequency measurement setup
+as a prototype for the frequency measurement component of the demodulation system we will develop later. Since we do
+not plan to do a large-scale field deployment of our measurement setup we can keep the hardware implementation simple by
+moving most of the signal processing to a regular computer and concentrating our hardware efforts on raw signal capture.
+
+\begin{figure}
+ \begin{center}
+ \begin{tikzpicture}[start chain = going below, node distance = 12mm and 50mm, every join/.style = {norm}]
+ \tikzset{
+ base/.style = {draw, on chain, on grid, align=center, minimum height = 4ex, font=\footnotesize},
+ text/.style = {base},
+ component/.style = {base, rectangle, text width=40mm},
+ coord/.style = {coordinate, on chain, on grid, node distance=6mm and 25mm}
+ }
+ \node[text centered] (input) {Single phase mains input};
+ \node[component] (safety) [below = of input] {Input protection};
+ \node[coord] (safety-anchor) [below = of safety] {};
+ \node[component] (analog) [below = of safety-anchor] {Analog signal processing};
+ \node[component] (powersupply) [left = of analog] {Power supply};
+ \node[component] (adc) [below = of analog] {ADC};
+ \node[component] (micro) [below = of adc] {Microcontroller};
+ \node[component] (isol) [below = of micro] {Galvanic digital isolation};
+ \node[coord] (isol-left) [left = 6cm of isol.west] {};
+ \node[coord] (isol-right) [right = 1cm of isol.east] {};
+ \node[component] (usb) [below = of isol] {USB interface};
+
+ \draw[->] (input.south) -- (safety.north);
+ \draw[-] (safety.south) -- (safety-anchor);
+ \draw[->] (safety-anchor) -| (powersupply.north);
+ \draw[->] (safety-anchor) -| (analog.north);
+ \draw[->] (powersupply.south) |- (adc.west);
+ \draw[->] (powersupply.south) |- (micro.west);
+ \draw[->] (analog.south) -- (adc.north);
+ \draw[->] (adc.south) -- (micro.north);
+ \draw[->] (micro.south) -- (isol.north);
+ \draw[->] (isol.south) -- (usb.north);
+
+ \draw[dashed] (isol.west) -- (isol-left.east);
+ \draw[dashed] (isol.east) -- (isol-right.west);
+ \end{tikzpicture}
+ \end{center}
+ \caption{Frequency sensor hardware block diagram.}
+ \label{fmeas-sens-diag}
+\end{figure}
+
+An overall block diagram of our system is shown in Figure \ref{fmeas-sens-diag}. The microcontroller we chose is an
+\texttt{STM32F030F4P6} ARM Cortex M0 microcontroller made by ST Microelectronics. The ADC in Figure
+\ref{fmeas-sens-diag} in our implementation is the integrated 12-bit ADC of this microcontroller, which is sufficient
+for our purposes. The USB interface is a simple USB to serial converter IC (\texttt{CH340G}) and the galvanic digital
+isolation is accomplished with a pair of high speed optocouplers on its \texttt{RX} and \texttt{TX} lines. The analog
+signal processing is a simple voltage divider using high power resistors to get the required creepage along with some
+high frequency filter capacitors and an op-amp buffer. The power supply is an off-the-shelf mains-input power module.
+The system is implemented on a single two-layer PCB that is housed in an off-the-shelf industrial plastic case fitted
+with a printed label and a few status lights on its front. The schematics of our system can be found in Appendix
+\ref{sec-app-freq-sens-schematics}.
+
+\subsection{Clock accuracy considerations}
+
+Our measurement hardware will sample line voltage at some sampling rate $f_S$, e.g.\ \SI{1}{\kilo\hertz}. All downstream
+processing is limited in accuracy by the accuracy of $f_S$\footnote{
+We are not considering the effect of clock jitter. We are highly oversampling the signal and the FFT done in our
+downstream processing will average out small jitter effects leaving only frequency stability to worry about. }. We
+generate our sampling clock in hardware by clocking the ADC from one of the microcontroller's timer blocks clocked from
+the microcontroller's system clock. This means our ADC's sampling window will be synchronized cycle-accurate to the
+microcontroller's system clock.
+
+Our downstream estimation of mains frequency by nature is relative to our sampling frequency $f_S$. In the setup
+described above this means we have to make sure our system clock is stable. A frequency deviation of \SI{1}{ppm} in our
+system clock causes a proportional grid frequency measurement error of $\Delta f = f_\text{nom} \cdot 10^{-6} =
+\SI{50}{\micro\hertz}$. In a worst-case scenario where our system is clocked from a particularly bad crystal that
+exhibits \SI{100}{ppm} of instabilities over our measurement period we end up with an error of \SI{5}{\milli\hertz}.
+This is well within our target measurement range, so we need a more stable clock source. Ideally we want to avoid
+writing our own clock conditioning code where we try to change an oscillator's operating frequency to match some
+reference. Clock conditioning algorithms are complex\cite{ti01} and in our case post processing of measurement data and
+simply adding an offset is simpler and less error-prone.
+
+Our solution to these problems is to use a crystal oven\footnote{
+ A crystal oven is a crystal oscillator closely thermally coupled to a heater and temperature sensor and enclosed in
+ a thermally isolated case. The heater is controlled to hold the crystal oscillator at a near constant temperature
+ some tens of degrees Celsius above ambient temperature. Ambient temperature variations will be absorbed by the
+ temperature control. This yields a crystal frequency that is almost completely unaffected by ambient temperature
+ variations below the oven temperature and whose main remaining instability is aging.
+} as our main system clock source. Crystal ovens are expensive compared to ordinary crystal oscillators. Since any
+crystal oven will be much more accurate than a standard room-temperature crystal we chose to reduce cost by using one
+recycled from old telecommunications equipment.
+
+To verify clock accuracy we routed an externally accessible SMA connector to a microcontroller pin that is routed to one
+of the microcontroller's timer inputs. By connecting a GPS 1pps signal to this pin and measuring its period we can
+calculate our system's Allan variance\footnote{
+ Allan variance is a measure of frequency stability between two clocks.
+}, thereby measuring both clock stability and clock accuracy.
+We ran a 4 hour test of our frequency sensor that generated the histogram shown in Figure \ref{ocxo_freq_stability}.
+These results show that while we get a systematic error of about \SI{10}{ppm} due to manufacturing tolerances the
+random error at less than \SI{10}{ppb} is smaller than that of a room-temperature crystal oscillator by 3--4 orders of
+magnitude. Since we are interested in grid frequency variations over time but not in the absolute value of grid
+frequency the systematic error is of no consequence to us. The random error at \SI{3.66}{ppb} corresponds to a
+frequency measurement error of about \SI{0.2}{\micro\hertz}, well below what we can achieve at reasonable sampling rates
+and ADC resolution.
+
+\begin{figure}
+ \centering
+ \includegraphics{../lab-windows/fig_out/ocxo_freq_stability}
+ \caption{OCXO Frequency deviation from its nominal \SI{19.440}{\mega\hertz} frequency measured against a GPS
+ receiver's 1pps reference output.}
+ \label{ocxo_freq_stability}
+\end{figure}
+
+\subsection{Firmware implementation}
+
+The firmware uses one of the microcontroller's timers clocked from an external crystal oscillator to produce a
+\SI{1}{\milli\second} tick that the internal ADC is triggered from for a sample rate of \SI{1}{\kilo sps}. Higher sample
+rates would be possible but reliable data transmission over the opto-isolated serial interface might prove challenging
+and \SI{1}{\kilo sps} already corresponds to $20$ samples per cycle at $f_\text{nominal}$. This figure exceeds the
+Nyquist criterion by a factor of ten and is plenty for accurate measurements.
+
+The ADC measurements are read using DMA and written into a circular buffer. Using DMA controller features this
+circular buffer is split in back and front halves with one being written to and the other being read at the same time.
+Buffer contents are moved from the ADC DMA buffer into a packet-based reliable UART interface as they come in. The UART
+packet interface keeps two ring buffers: One byte-based ring buffer for transmission data and one ring buffer pointer
+structure that keeps track of ADC data packet boundaries in the byte-based ring buffer. Every time a chunk of data is
+available from the ADC the data is framed into the byte-based ring buffer and the packet boundaries are logged in the
+packet pointer ring buffer. If the UART transmitter is idle at this time a DMA-backed transmission of the oldest packet
+in the packet ring buffer is triggered at this point. Data is framed using Consistent Overhead Byte Stuffing
+(COBS)\footnote{
+COBS is a framing technique that allows encoding $n$ bytes of arbitrary data into exactly $n+1$ bytes with no embedded
+$0$ bytes that can then be delimited using $0$ bytes. COBS is simple to implement and allows both one pass decoding and
+encoding. The encoder either needs to be able to read up to \SI{256}{\byte} ahead or needs a buffer of \SI{256}{\byte}.
+COBS is very robust in that it allows self-synchronization. At any point a receiver can reliably synchronize itself
+against a COBS data stream by waiting for the next $0$ byte. The constant overhead allows precise bandwidth and buffer
+planning and provides constant, good efficiency close to the theoretical maximum.}\cite{cheshire01} along with a
+CRC-32 checksum for error checking. When the host receives a new packet with a valid checksum it returns an
+acknowledgement packet to the sensor. When the sensor receives the acknowledgement, the acknowledged packet is dropped
+from the transmission packet ring buffer. When the host detects an incorrect checksum it simply stays quiet and waits for
+the sensor to resume with retransmission when the next ADC buffer has been received.
+
+The serial interface logic presents most of the complexity of the sensor firmware. This complexity is necessary since
+we need reliable, error-checked transmission to the host. Though rare, bit errors on a serial interface do happen and
+data corruption is unacceptable. The packet layer queueing on the sensor is necessary since the host is not a realtime
+system and unpredictable latency spikes of several hundred milliseconds are possible.
+
+The host in our recording setup is a Raspberry Pi 3 model B running a Python script. The Python script handles serial
+communication and logs data and errors into an SQLite database file. SQLite has been chosen for its simple yet flexible
+interface and its good tolerance of system resets due to unexpected power loss. Overall our setup performed adequately
+with IO contention on the Raspberry Pi/Linux side causing only 16 skipped sample packets over a 68 hour recording span.
+
+\subsection{Frequency sensor measurement results}
+
+\begin{figure}
+ \centering
+ \begin{minipage}[c]{0.48\textwidth}
+ \includegraphics{resources/grid_meas_device_front.jpg}
+ \end{minipage}
+ \begin{minipage}[c]{0.48\textwidth}
+ \includegraphics{resources/grid_meas_device_open.jpg}
+ \end{minipage}
+ \vspace*{3mm}
+ \caption{
+ The finished grid frequency sensor device. The large yellow part on the bottom left is the crystal oven. The
+ large black part is the power supply module. The microcontroller is on the bottom right of the device and the
+ measurement circuit is in its middle. The device connects to the data recording computer via galvanically
+ isolated USB on the bottom and to a regular wall socket through the IEC connector on the top of the device.
+ }
+ \label{pic_freq_sensor}
+\end{figure}
+
+Our completed frequency sensor can be seen in Figure \ref{pic_freq_sensor}. The raw voltage waveform data we captured
+with it has been processed in the Jupyter Lab environment\cite{kluyver01} and grid frequency estimates are extracted as
+described in Section \ref{frequency_estimation} using the Gasior and Gonzalez\cite{gasior01} technique. The Jupyter
+notebook we used for frequency measurement is included with the supplementary materials to this thesis. In Figure
+\ref{freq_meas_feedback} we fed back to the frequency estimator its own output giving us an indication of its numerical
+performance. The result was \SI{1.3}{\milli\hertz} of RMS noise over a \SI{3600}{\second} simulation time. This
+indicates performance is good enough for our purposes. In addition to this we validated our algorithm's performance by
+applying it to the test waveforms from \cite{wright01}. In this test we got errors of \SI{4.4}{\milli\hertz} for the
+\emph{noise} test waveform, \SI{0.027}{\milli\hertz} for the \emph{interharmonics} test waveform and
+\SI{46}{\milli\hertz} for the \emph{amplitude and phase step} test waveform. Full results can be found in Figure
+\ref{freq_meas_rocof_reference}.
+
+Figures \ref{freq_meas_trace} and \ref{freq_meas_trace_mag} show our measurement results over a 24-hour and a 2-hour
+window respectively.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_feedback}
+ \caption{
+ The frequency estimation algorithm applied to a synthetic noise-less mains waveform generated from its own
+ output. This feedback simulation gives an indication of numerical errors in our estimation algorithm. The top
+ four graphs show a comparison of the original trace (blue) and the re-calculated trace (orange). The bottom
+ trace shows the difference between the two. As we can tell both traces agree very well with an overall RMS
+ deviation of about \SI{1.3}{\milli\hertz}. The bottom trace shows deviation growing over time. This is an effect
+ of numerical errors in our ad hoc waveform generator.
+ }
+ \label{freq_meas_feedback}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_rocof_reference}
+ \caption{
+ Performance of our frequency estimation algorithm under the test suite specified in \cite{wright01}. Shown are
+ standard deviation and variance measurements as well as time-domain traces of absolute differences.
+ }
+ \label{freq_meas_rocof_reference}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_trace_24h}
+ \caption{Trace of grid frequency over a 24 hour time span. One clearly visible feature is the large positive and negative
+ transients at full hours. Times shown are UTC. Note that the European continental synchronous area that this
+ sensor is placed in covers several time zones which may result in images of daily load peaks appearing in 1 hour
+ intervals. Figure \ref{freq_meas_trace_mag} contains two magnified intervals from this plot.}
+ \label{freq_meas_trace}
+\end{figure}
+
+\begin{figure}
+ \begin{subfigure}{\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_trace_2h_1}
+ \caption{A 2 hour window centered on 00:00 UTC.}
+ \end{subfigure}
+ \begin{subfigure}{\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_trace_2h_2}
+ \caption{A 2 hour window centered on 18:30 UTC.}
+ \end{subfigure}
+ \caption{Two magnified 2 hour windows of the trace from Figure \ref{freq_meas_trace}.}
+ \label{freq_meas_trace_mag}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{../lab-windows/fig_out/mains_voltage_spectrum}
+ \caption{Power spectral density of the mains voltage trace in Figure \ref{freq_meas_trace}. Data was captured using
+ our frequency measurement sensor (Section \ref{sec-fsensor}) and FFT-processed after applying a Blackman window. The
+ vertical lines indicate \SI{50}{\hertz} and odd harmonics. We can see the expected peak at \SI{50}{\hertz} along
+ with smaller peaks at odd harmonics. We can also see a number of spurious tones both between harmonics and at low
+ frequencies. We can also see bands containing high noise energy around \SI{0.1}{\hertz}. This graph shows a high
+ signal-to-noise ratio that is not very demanding on our frequency estimation algorithm.
+ }
+ \label{mains_voltage_spectrum}
+\end{figure}
+
+\section{Channel simulation and parameter validation}
+\label{sec-ch-sim}
+
+To validate all layers of our communication stack from modulation scheme to cryptography we built a prototype
+implementation in Python. Implementing all components in a high level language builds up familiarity with the concepts
+while taking away much of the implementation complexity. For our demonstrator we will not be able to use Python since
+our target platform is an inexpensive low-end microcontroller. Our demonstrator firmware will have to be written in a
+low-level language such as C or Rust. For prototyping these languages lack flexibility compared to Python.
+
+To validate our modulation scheme we first performed a series of simulations on our Python demodulator prototype
+implementation. To simulate a modulated grid frequency signal we added noise to a synthetic modulation signal. For most
+simulations we used measured frequency data gathered with our frequency sensor. We only have a limited amount of capture
+data. Re-using segments of this data as background noise in multiple simulation runs could lead to our simulation
+results depending on individual features of this particular capture that would be common between all runs. To estimate
+the impact of this problem we re-ran some of our simulations with artificial random noise synthesized with a power
+spectral density matching that of our capture. To do this, we first measured our capture's PSD, then fitted a
+low-resolution spline to the PSD curve in log-log coördinates. We then generated white noise, multiplied the resampled
+spline with the DFT of the synthetic noise and performed an iDFT on the result. The resulting time-domain signal is our
+synthetic grid frequency data. Figure \ref{freq_meas_spectrum} shows the PSD of our measured grid frequency signal. The
+red line indicates the low-resolution log-log spline interpolation used for shaping our artificial noise. Figure
+\ref{simulated_noise_spectrum} shows the PSD of our simulated signal overlaid with the same spline as a red line and
+shows time-domain traces of both simulated (blue) and reference signals (orange) at various time scales. Visually both
+signals look very similar, suggesting that we have found a good synthetic approximation of our measurements.
+
+\begin{figure}
+ \centering
+ \hspace*{-1.2cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/freq_meas_spectrum}
+ \caption{Power spectral density of the 24 hour grid frequency trace in Figure \ref{freq_meas_trace} with some notable
+ peaks annotated with the corresponding period in seconds. The $\frac{1}{f}$ line indicates a pink noise spectrum.
+ Around a period of \SI{20}{\second} the PSD starts to fall off at about $\frac{1}{f^3}$ until we can make out some
+ bumps at periods around $2$ and \SI{3}{\second}. Starting at around \SI{1}{\hertz} we can see a white noise floor in
+ the order of \si{\micro\hertz^2\per\hertz}.
+ % TODO: where does this noise floor come from? Is it a fundamental property of the grid? Is it due to limitations of
+ % our measurement setup (such as ocxo stability/phase noise) ???
+ }
+ \label{freq_meas_spectrum}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \hspace*{-1.2cm}
+ \includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/simulated_noise_spectrum}
+ \caption{Synthetic grid frequency in comparison with measured data. The topmost graph shows the synthetic spectrum
+ compared to the spline approximation of the measured spectrum (red line). The other graphs show time-domain
+ synthetic data (blue) in comparison with measured data (orange).
+ }
+ \label{simulated_noise_spectrum}
+\end{figure}
+
+In our simulations, we manipulated four main variables of our modulation scheme and demodulation algorithm and observed
+their impact on symbol error rate (SER):
+
+\begin{description}
+ \item[Modulation amplitude.] Higher amplitude corresponds to a lower SER.
+ \item[Modulation bit count.] Higher bit count $n$ means longer transmissions but yields higher theoretical decoding
+ gain, and should increase demodulator sensitivity. Ultimately, we want to find a sweet spot of manageable
+ transmission length at good demodulator sensitivity.
+ \item[Decimation or DSSS chip duration.] The chip time determines where in the grid frequency spectrum (Figure
+ \ref{freq_meas_spectrum}) our modulated signal is located. Given our noise spectrum (Figure
+ \ref{freq_meas_spectrum}) lower chip durations (shifting our signal upwards in the spectrum) should yield lower
+ in-band background noise which should correspond to lower symbol error rates.
+ \item[Demodulation correlator peak threshold factor.] The first step of our prototype demodulation algorithm is to
+ calculate the correlation between all $2^n+1$ Gold sequences and our signal and to identify peaks corresponding
+ to the input data containing a correctly aligned Gold sequence. The threshold factor determines peaks of which
+ magnitude compared to baseline noise levels are considered in the following maximum likelihood estimation (MLE)
+ decoding (cf.\ Figure \ref{fig_demo_sig_schema}).
+\end{description}
+
+Our results indicate that symbol error rate is a good proxy of demodulation performance. With decreasing signal-to-noise
+ratio, margins in various parts of the demodulator decrease which statistically leads to an increased symbol error rate.
+Our simulations yield smooth, reproducible SER curves with adequately low error bounds. This shows SER is related
+monotonically to the signal-to-noise margins inside our demodulator prototype.
+
+\subsection{Sensitivity as a function of sequence length}
+
+A basic parameter of our DSSS modulation is the length of the Gold codes used. The length of a Gold code is exponential
+in the code's bit count. Figure \ref{dsss_gold_nbits_overview} shows a plot of the symbol error rate of our demodulator
+prototype depending on amplitude for each of five, six, seven and eight bit Gold sequences. In regions where symbol
+error rate is neither clipping at $0$ nor at $1$ we can see the expected dependency that an $n+1$ bit Gold sequence at
+roughly twice the length yields roughly one half the SER. We can also observe a saturation effect: At low amplitudes,
+increasing the correlation length does not yield much benefit in SER anymore. In particular at a signal amplitude of
+\SI{2.5}{\milli\hertz} even with asymptotically infinite sequence length our demodulator would still not be able to
+produce a good demodulation. This is likely due to numerical errors in our demodulator. Since Gold codes of more than 7
+bit would yield unacceptably long transmission times this does not pose a problem in practice.
+
+Figure \ref{dsss_gold_nbits_sensitivity} for each bit count shows the minimum signal amplitude at which our demodulator
+crossed below $\text{SER}=0.5$. If we have sufficient transmitter power to allocate selecting either a 5 bit or a 6 bit
+Gold code yields sufficient performance at manageable data rates.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=0.6\textwidth]{../lab-windows/fig_out/dsss_gold_nbits_overview}
+ \caption{
+ Symbol Error Rate (SER) as a function of transmission amplitude. The line represents the mean of several
+ measurements for each parameter set. The shaded areas indicate one standard deviation from the mean. Background
+ noise for each trial is a random segment of measured grid frequency. Background noise amplitude is the same for
+ all trials. Shown are four traces for four different DSSS sequence lengths. Using a 5 bit Gold code, one DSSS
+ symbol measures 31 chips. 6 bit per symbol are 63 chips, 7 bit are 127 chips and 8 bit 255 chips. This
+ simulation uses a decimation of 10, which corresponds to a \SI{1}{\second} chip length at our \SI{10}{\hertz} grid
+ frequency sampling rate. At 5 bit per symbol, one symbol takes \SI{31}{\second} and one bit takes \SI{6.2}{\second}
+ amortized. At 8 bit one symbol takes $\SI{255}{\second} = \SI{4}{\minute}\,\SI{15}{\second}$ and one bit takes \SI{31.9}{\second}
+ amortized. Here, slower transmission speed buys coding gain. All else being equal this allows for a decrease
+ in transmission power.
+ }
+ \label{dsss_gold_nbits_overview}
+\end{figure}
+
+\begin{figure}
+ \centering
+ \begin{minipage}[c]{0.5\textwidth}
+ \hspace*{-1cm}\includegraphics[width=1.1\textwidth]{../lab-windows/fig_out/dsss_gold_nbits_sensitivity}
+ \end{minipage}\begin{minipage}[c]{0.45\textwidth}
+ \caption{
+ Amplitude at an SER of 0.5\ in mHz depending on symbol length. Here we can observe an increase of sensitivity
+ with increasing symbol length, but we can clearly see diminishing returns above 6 bit (63 chips). Considering
+ that each bit roughly doubles overall transmission time for a given data length it seems lower bit counts are
+ preferable if the required transmitter power can be realized.
+ }
+ \label{dsss_gold_nbits_sensitivity}
+ \end{minipage}
+\end{figure}
+
+\subsection{Sensitivity versus peak detection threshold factor}
+
+One of the high level parameters of our demodulation algorithm is the \emph{threshold factor}. This parameter is
+an implementation detail specific to our algorithm and not general to all possible DSSS demodulation algorithms. After
+correlating the input signal against the template Gold sequences our algorithm runs a single channel discrete wavelet
+transform (DWT) on the correlator output to better discriminate peaks from background noise. The output of this DWT is
+then normalized against a running average and then fed into a simple threshold detector. The threshold of this detector
+is our threshold factor. This threshold is the ratio by which a correlation peak after DWT has to stand out from long-term
+average background noise to be considered a peak.
+
+The threshold factor is an empirically determined unitless parameter. Low threshold factors yield many false positives
+that in the extreme ultimately overload our MLE estimator's capacity to discard them. Moderate numbers of false
+positives do not pose much of a challenge to our MLE since these spurious peaks have a random time distribution and are
+easily discarded by our MLE's detection of sequences of equally-spaced symbols. High threshold factors lead the
+algorithm to completely ignore some valid peaks. To some degree this can be compensated by our later interpolation step
+for missing peaks but in the extreme will also break demodulation. In our simulations good values lie in the range from
+$4.0$ to $5.5$.
+
+Figure \ref{dsss_thf_amplitude_5678} contains plots of demodulator sensitivity like the one in Figure
+\ref{dsss_gold_nbits_overview}. This time there is one color-coded trace for each threshold factor between $1.5$ and
+$10.0$ in steps of $0.5$. We can see a clear dependency of demodulation performance on threshold factor with both very
+low and very high values breaking the demodulator. The runaway traces that we can see at low threshold factors are
+artifacts of an implementation issue with our prototype code. We later fixed this issue in the demonstrator firmware
+in Section \ref{sec-demo-fw-impl}. For comparison purposes this issue does not matter.
+
+\begin{figure}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/dsss_thf_amplitude_5678}
+ \caption{
+ SER vs.\ amplitude graph similar to Figure \ref{dsss_gold_nbits_overview} with one color-coded trace for each
+ threshold factor between $1.5$ and $10.0$. Each graph shows traces for a single DSSS symbol length.
+ }
+ \label{dsss_thf_amplitude_5678}
+\end{figure}
+
+If we again look at the intercept points where the amplitude traces cross $\text{SER}=0.5$ in these graphs we get the
+plots in Figure \ref{dsss_thf_sensitivity_all_bits}. From this we can conclude that the range between $4.0$ and $5.0$ will
+yield adequate threshold factors for our use case.
+
+\begin{figure}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.1\textwidth]{../lab-windows/fig_out/dsss_thf_sensitivity_5678}
+ \caption{
+ Graphs of amplitude at $\text{SER}=0.5$ for each symbol length as well as asymptotic SER for large amplitudes. Areas
+ shaded red indicate that $\text{SER}=0.5$ was not reached for any amplitude in the simulated range. The bumps in the 7
+ bit and 8 bit graphs are due to the convergence problem we identified above and do not exist in our demonstrator
+ implementation. We see that smaller symbol lengths favor lower threshold factors, and that optimal threshold
+ factors for all symbol lengths are between $4.0$ and $5.0$.
+ }
+ \label{dsss_thf_sensitivity_all_bits}
+\end{figure}
+
+\subsection{Chip duration and bandwidth}
+
+A parameter of any DSSS system is the frequency band used for transmission. Instead of specifying absolute frequencies
+in our simulations we expressed DSSS bandwidth through chip duration and Gold sequence length. In our prototype, chip
+duration is specified in grid frequency sampling periods to ease implementation without loss of generality.
+
+Figure \ref{chip_duration_sensitivity} shows the dependence of symbol error rate at a fixed good threshold factor on
+chip duration. The color bars indicate both chip duration translated to seconds real-time and the resulting symbol
+duration at the given Gold code length. In the lower graphs we show the trace of amplitude at $\text{SER}=0.5$ over chip
+duration like we did in Figure \ref{dsss_thf_sensitivity_all_bits} for threshold factor. In both graphs we can see a
+faint optimum for very short chips with a decrease of sensitivity for long chips. This effect is due to longer chips
+moving the signal band into noisier spectral regions (cf.\ Figure \ref{freq_meas_spectrum}).
+
+\begin{FPfigure}
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_5}
+ \vspace*{-1cm}
+ \caption{
+ 5 bit Gold code.
+ }
+ \label{chip_duration_sensitivity_5}
+ \end{subfigure}
+%\end{figure}
+%\begin{figure}
+% \ContinuedFloat
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_6}
+ \vspace*{-1cm}
+ \caption{
+ 6 bit Gold code.
+ }
+ \label{chip_duration_sensitivity_6}
+ \end{subfigure}
+ \caption{
+ Dependence of demodulator sensitivity on DSSS chip duration. Due to computational constraints this simulation is
+ limited to 5 bit and 6 bit DSSS sequences. There is a clearly visible sensitivity maximum at short chip
+ lengths around $0.2 \text{s}$. Short chip durations shift the entire transmission band up in frequency. In
+ Figure \ref{freq_meas_spectrum} we can see that noise energy is mostly concentrated at lower frequencies, so
+ shifting our signal up in frequency will reduce the amount of noise the decoder sees behind the correlator by
+ shifting the band of interest into a lower-noise spectral region. For a practical implementation chip duration
+ is limited by physical factors such as the maximum modulation slew rate ($\frac{\text{d}P}{\text{d}t}$) that can
+ be technically realized and the maximum Rate-Of-Change-Of-Frequency (ROCOF, $\frac{\text{d}f}{\text{d}t}$) that
+ the grid can tolerate.
+ }
+ \label{chip_duration_sensitivity}
+\end{FPfigure}
+
+In the previous graphs we have used random clips of measured grid frequency noise as noise in our simulations. Comparing
+between a simulation using measured noise and synthetic noise generated as we outlined in the beginning of Section
+\ref{sec-ch-sim} we get the plots in Figure \ref{chip_duration_sensitivity_cmp}. We can see that while not perfect our
+simulated noise is an adequate approximation of reality: Our prototype demodulator shows no significant difference in
+behavior between measured and simulated noise. Simulated noise causes slightly worse performance for long chips. Overall
+the results for both are very close in absolute value.
+
+\begin{FPfigure}
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_cmp_meas_6}
+ \vspace*{-1cm}
+ \caption{
+ Simulation using baseline frequency data from actual measurements.
+ }
+ \label{chip_duration_sensitivity_cmp_meas_6}
+ \end{subfigure}
+%\end{figure}
+%\begin{figure}
+% \ContinuedFloat
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}\includegraphics[width=1.2\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_cmp_synth_6}
+ \vspace*{-1cm}
+ \caption{
+ Simulation using synthetic frequency data.
+ }
+ \label{chip_duration_sensitivity_cmp_synth_6}
+ \end{subfigure}
+ \caption{
+ Chip duration/sensitivity simulation results like in Figure \ref{chip_duration_sensitivity} compared between a
+ simulation using measured frequency data like in the previous graphs and one using artificially generated noise.
+ There is little visible difference indicating that we have found a good model of reality in our noise
+ synthesizer, but also that real grid frequency behaves like a frequency-shaped Gaussian noise process.
+ }
+ \label{chip_duration_sensitivity_cmp}
+\end{FPfigure}
+
+\section{Implementation of a demonstrator unit}
+\label{sec-prototype}
+
+To demonstrate the viability of our reset architecture we decided to implement a demonstrator system. In this
+demonstrator we use JTAG to reset part of a commodity smart meter from an externally-connected reset controller. The
+reset controller receives its commands over the grid frequency modulation system we outlined in this thesis. To keep
+implementation cost low the reset controller is fed a simulation of a modulated grid frequency signal through a standard
+\SI{3.5}{\milli\meter} audio jack\footnote{
+ By generously cutting two PCB traces the meter we chose to use can be easily modified to provide galvanic separation
+ between grid and main application microcontroller. With this modification we have to supply power to its main
+ application MCU externally along with the JTAG interface but now the modified meter is electrically safe.
+}. Measurement of actual grid frequency instead would simply require a voltage divider and depending on the setup an
+analog optoisolator.
+
+\subsection{Selecting a smart meter for demonstration purposes}
+\label{sec-easymeter}
+
+\begin{figure}[h!]
+ \centering
+ \begin{subfigure}{\textwidth}
+ \centering
+ \includegraphics[width=0.6\textwidth]{resources/easymeter_board_composite.jpg}
+ \caption{
+ \footnotesize
+ Optical composite image of the display and data logging board in the top of the case. The six pins at the
+ top are the SPI chip-on-glass segment LCD. Of the eight pads on the left six are unused and two carry the
+ auxiliary power supply from the measurement board below. The bottom right section contains the
+ \si{\kilo\watt\hour} impulse LED and the angled IR communication LED. The flying wires
+ connect to the 14-pin JTAG and serial debug header.
+ }
+ \label{easymeter_display_board_composite}
+ \end{subfigure}
+ \begin{subfigure}{\textwidth}
+ \vspace{1cm}
+ \centering
+ \includegraphics[width=0.8\textwidth]{resources/easymeter_baseboard_composite.jpg}
+ \caption{
+ \footnotesize
+ Composite microfocus x-ray image of the potted measurement module in the bottom of the case. The ovals on
+ the top left and right are power supply and data jumper connections for external modules such as SMGW
+ interfaces. The bright parts at the bottom are the massive screw terminals with integrated current shunts.
+ The circuitry right of the three independent measurement channels is the power supply circuit for the
+ display board.
+ }
+ \label{easymeter_measurement_board_composite}
+ \end{subfigure}
+
+ \caption{
+ Composite images of the circuit boards inside the EasyMeter Q3DA1002 smart electricity meter used in our
+ demonstration.
+ }
+ \label{easymeter_composites}
+\end{figure}
+
+\begin{figure}[h!]
+ \centering
+ \begin{subfigure}{0.45\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{resources/easymeter_baseboard_channel.jpg}
+ \caption{Microfocus x-ray of one channel's data acquisition circuit.}
+ \label{easymeter_channel_xray}
+ \end{subfigure}\hspace*{5mm}
+ \begin{subfigure}{0.45\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{resources/easymeter_baseboard_powersupply.jpg}
+ \caption{Microfocus x-ray of the auxiliary power supply.}
+ \label{easymeter_powersupply_xray}
+ \end{subfigure}
+
+ \caption{
+ Microfocus x-rays of major sections of the EasyMeter Q3DA1002 measurement board.
+ }
+ \label{easymeter_detail_xrays}
+\end{figure}
+
+For our demonstrator to make sense we wanted to select a realistic reset target. In Germany where this thesis was
+written a standards-compliant setup would consist of a comparatively feature-limited smart meter and a smart meter
+gateway (SMGW) containing all of the complex bidirectional protocol logic such as wireless or landline IP connectivity.
+The realistic target for a setup in this architecture would be the components of an SMGW such as its communication modem
+or main application processor. In the German architecture the smart meter does not even have to have a bi-directional
+data link to the SMGW effectively mitigating any attack vector for remote compromise.
+
+Despite these considerations we still chose to reset the application MCU inside the smart meter for two reasons. One is that
+SMGWs are much rarer on the second-hand market. The other is that SMGWs are a particular feature of the German
+standardization landscape and in many other countries functions of an SMGW such as wireless protocol handling are
+integrated into the meter itself (see e.g.\ \cite{honeywell01}).
+
+In the end we settled on a Q3DA1002 three phase 60A meter made by German manufacturer EasyMeter. This meter is typical
+of what would be found in an average German household and can be acquired very inexpensively as new old stock on online
+marketplaces.
+
+The meter consists of a plastic enclosure with a transparent polycarbonate top part and a gray ABS bottom part that are
+ultrasonically welded together. In the bottom part of the case a PCB we call the \emph{measurement} board is potted in
+epoxide resin (see Figure \ref{easymeter_composites}). This PCB contains three separate energy measurement ASICs for the
+three phases (see Figure \ref{easymeter_detail_xrays}). It also contains a capacitive dropper power supply for the meter
+circuitry and external modules such as an SMGW. The measurement board through three infrared links (one per phase)
+communicates with a smaller unpotted PCB we call the \emph{display} board in the top of the case. This PCB handles
+measurement logging and aggregation, controls a small segment LCD displaying totals and handles the externally
+accessible \si{\kilo\watt\hour} impulse LED and serial IR links.
+
+The measurement board does not contain any logging or outside communication interfaces. All of that is handled on the
+display board by a Texas Instruments \texttt{MSP430F2350} application MCU. This is a 16-bit RISC MCU with
+\SI{16}{\kilo\byte} flash and \SI{2}{\kilo\byte} SRAM\footnote{
+ At first glance the microcontroller might seem overkill for such a simple application, but most of its
+ \SI{16}{\kilo\byte} program flash is in fact used. A casual glance with Ghidra shows that a large part of program
+ flash is expended on keeping multiple redundant copies of energy consumption aggregates including error recovery in
+ case of data corruption and some effort has even been made to guard against data corruption using simple
+ non-cryptographic checksums. Another large part of the MCU's firmware handles data transmission over the meter's
+ externally accessible IR link through Smart Message Language\cite{bsi-tr-03109-1-IVb}.
+}. There is an I2C EEPROM that is used in conjunction with the microcontroller's internal \SI{256}{\byte} data flash to
+keep redundant copies of energy consumption aggregates. On the side of the display board there is a 14-pin header
+containing both a standard TI MSP430 JTAG pinout and a UART serial interface for debugging. Conveniently, the JTAG port
+was left enabled by fuse in our particular production unit.
+
+We chose to use this \texttt{MSP430} series application MCU as our reset target. Though in this particular unit remote
+compromise is impossible due to a lack of bidirectional communication links some of its sister models do contain
+bidirectional communication links\cite{easymeter01} making compromise through communication interfaces an at least
+theoretical possibility. In other countries, meters with a similar architecture to the Q3DA1002 include complex protocol
+logic as part of the meter itself or have bidirectional links to it\cite{honeywell01,ifixit01,bigclive01,eevblog01}. As
+an example, the Honeywell REX2 uses a Maxim Integrated \texttt{71M6541} main application microcontroller along with a
+Texas Instruments \texttt{CC1000} series radio transceiver and is advertised to support both over-the-air firmware
+upgrade and a remotely accessible disconnect switch.
+
+\subsection{Firmware implementation}
+\label{sec-demo-fw-impl}
+
+We based our safety reset demonstrator firmware on the grid frequency sensor firmware we developed in Section
+\ref{sec-fsensor}. We implemented DSSS demodulation by translating the Python prototype code we developed in Section
+\ref{sec-ch-sim} to embedded C code. After validating the C translation in extensive simulations we integrated our code
+with a Reed-Solomon implementation and a libsodium-based implementation of the cryptographic protocol we designed in
+Section \ref{sec-crypto}. To reprogram the target \texttt{MSP430} microcontroller we ported the low-level bitbang JTAG
+driver of \texttt{mspdebug}\footnote{\url{https://github.com/dlbeer/mspdebug}}. See Figure \ref{fig_demo_sig_schema} for
+a schematic overview of signal processing in our demonstrator.
+
+For all computation-heavy high level modules of our firmware such as the DSSS demodulator or the grid frequency
+estimator we wrote test fixtures that allow the same code that runs on the microcontroller to be executed on the host
+for testing. These test fixtures are very simple C programs that load input data from a file or the command line, run
+the algorithm and print results on standard output. To enable automatic testing of a large parameter set we run these
+test fixtures repeatedly from a set of Python scripts sweeping parameters.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=\textwidth]{resources/prototype_schema}
+ \caption{The signal processing chain of our demonstrator.}
+ \label{fig_demo_sig_schema}
+\end{figure}
+
+\section{Grid frequency modulation emulation}
+
+To emulate a modulated grid frequency signal we superimposed a DSSS-modulated signal at the proper amplitude with
+synthetic grid frequency noise generated according to the measurements we took in Section \ref{sec-fsensor}. In this
+primitive simulation we do not simulate the precise impulse response of the grid to a DSSS-modulated stimulus signal.
+Our results still serve to illustrate the possibility of data transmission in this manner since this impulse response
+can be compensated for at the transmitter by selecting appropriate modulation parameters (e.g.\ chip rate and amplitude)
+and at the receiver by equalization with a matched filter.
+
+\section{Experimental results}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=0.6\textwidth]{resources/prototype.jpg}
+ \caption{The completed prototype setup. The board on the left is the safety reset microcontroller. It is connected
+ to the smart meter in the middle through an adapter board. The top left contains a USB hub with debug interfaces to
+ the reset microcontroller. The cables on the bottom left are the debug USB cable and the \SI{3.5}{\milli\meter}
+ audio cable for the simulated mains voltage input.}
+ \label{fig_proto_pic}
+\end{figure}
+
+After extensive simulations and testing of the individual modules of our solution we proceeded to conduct a real-world
+experiment. We tried the demonstrator setup in Figure \ref{fig_proto_pic} using an emulated noisy DSSS signal in
+real-time. Our experiment went without any issues and the firmware implementation correctly reset the demonstrator's
+meter. We were happy to see that our extensive testing paid off: The demonstrator setup worked on its first try.
+
+Our experiment consisted of the demonstrator prototype with the meter flashed with its factory firmware connected to a
+microcontroller development board acting as the safety reset controller. The safety reset controller is connected to a
+laptop's audio output through an adapter board. The laptop plays back an emulated grid voltage waveform that the safety
+reset microcontroller measures and analyzes as it would when directly connected to the mains. When the microcontroller
+receives a reset sequence that is a valid signature using a development key incorporated into its firmware, it
+re-programs the smart meter through JTAG with a modified firmware image that displays a success message on the meter's
+LCD.
+
+We used a signature truncated at 120 bit in our experiment. We chose a 5 bit DSSS sequence. Taking the sign bit into
+account the length of the encoded signature is 20 DSSS symbols. On top of this we used Reed-Solomon error correction at
+a 2:1 ratio inflating total message length to 30 DSSS symbols. At the \SI{1}{\second} chip rate we used in other
+simulations as well this equates to an overall transmission duration of approximately \SI{15}{\minute}. To give the
+demodulator some time to settle and to produce more realistic conditions of signal reception we padded the modulated
+signal with unmodulated noise on both ends.
+
+\section{Lessons learned}
+
+Before settling on the commercial smart meter we first tried to use an \texttt{EVM430-F6779} smart meter evaluation kit
+made by Texas Instruments. This evaluation kit did not turn out well for two main reasons. One, it shipped with half the
+case missing and no cover for the terminal blocks. Because of this some work was required to get it electrically safe.
+Even after mounting it in an electrically safe manner the safety reset controller prototype would also have to be
+galvanically isolated to not pose an electrical safety risk since the main MCU is not isolated from the grid and the
+JTAG port is also galvanically coupled. The second issue we ran into was that the \texttt{EVM430-F6779} is based around
+an \texttt{MSP430F6779} microcontroller. This microcontroller is a rather large part within the \texttt{MSP430} series
+and uses a new revision of the CPU core and associated JTAG peripheral that are incompatible with all \texttt{MSP430}
+programmers we tried to use on it. \texttt{mspdebug} does not have support for it and porting TI's own JTAG programmer
+reference sources did not yield any results either. Finally we tried a USB-based programmer made by TI themselves that
+turned out to either have broken firmware or a hardware defect, leading to it frequently reënumerating on the USB.
+
+Overall our initial assumption that a development kit would certainly be easier to program than a commercial meter did
+not prove to be true. Contrary to our expectations the commercial meter had JTAG enabled allowing us to easily read out
+its stock firmware without needing to reverse-engineer vendor firmware update files or circumvent code protection
+measures. The fact that its firmware was only available in its compiled binary form was not much of a hindrance as it
+proved not to be too complex and all we wanted to know could be found out with just a few hours of digging in Ghidra.
+
+In the firmware development phase our approach of testing every module individually (e.g.\ DSSS demodulator, Reed-Solomon
+decoder, grid frequency estimation) proved to be very useful. In particular debugging benefited greatly from being able
+to run several thousand tests within seconds. In case of our DSSS demodulator this modular testing and simulation
+architecture allowed us to simulate thousands of runs of our implementation on test data and directly compare it to our
+Jupyter/Python prototype (see Figure \ref{fw_proto_comparison}). Since we spent more time polishing our embedded C
+implementation it turned out to perform better than our Python prototype. At the same time it shows fundamentally
+similar response to its parameters. One significant bug we fixed in the embedded C version was the Python version's
+tendency towards incorrect decodings at even very large amplitudes.
+
+\begin{figure}
+ \centering
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}
+ \includegraphics[trim={0 4cm 0 0},clip,width=1.2\textwidth]{../lab-windows/fig_out/dsss_thf_amplitude_56_jupyter_impl}
+ \caption{Python prototype.}
+ \end{subfigure}
+ \begin{subfigure}{\textwidth}
+ \centering
+ \hspace*{-1cm}
+ \includegraphics[trim={0 4cm 0 0},clip,width=1.2\textwidth]{../lab-windows/fig_out/dsss_thf_amplitude_56_fw_impl}
+ \caption{Embedded C implementation.}
+ \end{subfigure}
+
+ \caption{
+ Symbol error rate plots versus threshold factor for both our Python prototype (above) and our firmware
+ implementation of our demodulation algorithm. Note the slightly different threshold factor color scales. Cf.\
+ Figure \ref{dsss_thf_amplitude_5678}.
+ }
+ \label{fw_proto_comparison}
+\end{figure}
+
+In accordance with our initial estimations we did not run into any code space or computation bottlenecks from choosing
+floating point emulation instead of porting over our algorithms to fixed point calculations. The extremely slow sampling
+rate of our systems makes even heavyweight processing such as FFT or our brute-force dynamic programming approach to
+DSSS demodulation possible well within our performance constraints.
+
+Since we are only building a prototype we did not optimize firmware code size at all. The compiled code size of our
+firmware implementation is slightly larger than we would like at around \SI{64}{\kilo\byte} for our firmware image
+including everything except the target microcontroller firmware image. See Appendix~\ref{symbol_size_chart} for a graph
+illustrating the contribution of various parts of the signal processing toolchain to this total. Overall the most
+heavy-weight operations by far are the SHA512 implementation from libsodium and the FFT from ARM's CMSIS signal
+processing library. Especially the SHA512 implementation has large potential for size optimization because it is highly
+optimized for speed using extensive manual loop unrolling.
+
+\chapter{Future work}
+
+\section{Precise grid characterization}
+
+We based our simulations on a linear relationship between the generation/consumption power imbalance and grid frequency.
+Our literature study suggests that this is an appropriate first order approximation\cite{crastan03}. We kept the
+modulation bandwidth in our simulations inside a \SIrange{1000}{100}{\milli\hertz} frequency band that we reason is most
+likely to exhibit this linear behavior in practice. At lower frequencies primary control kicks in. With the frequency
+delta thresholds specified for primary control systems\cite{entsoe04} this would lead to significant non-linear
+effects. At higher frequencies grid frequency estimation at the receiver becomes more complex since the margins of the
+FFT transform shrink. Higher frequencies also come close to modes of mechanical oscillation in generators that usually
+lie at \SI{5}{\hertz} and above\cite{crastan03}.
+
+An analysis of the above concerns can be performed using dynamic grid simulation models\cite{semerow01,entsoe05}.
+Presumably out of security concerns these models are only available under non-disclosure agreements. Integrating
+NDA-encumbered results stemming from such a model in an open-source publication such as this one poses a logistical
+challenge which is why we decided to leave this topic for a separate future work.
+
+After detailed model simulation we ultimately aim to validate our results experimentally. Assuming linear grid behavior
+even under very small disturbances a small-scale experiment is an option. Such a small-scale experiment would require
+very long integration times: Given a frequency characteristic of \SI{30}{\giga\watt\per\hertz} a stimulus of
+\SI{10}{\kilo\watt} yields $\Delta f = \SI{0.33}{\micro\hertz}$. At an estimated \SI{20}{\milli\hertz} of RMS noise over
+a bandwidth of interest this results in an SNR slightly better than \SI{-50}{\decibel}. The correlation time necessary
+to offset this with DSSS processing gain at a chip rate of \SI{1}{\baud} would be in the order of days. With such long
+correlation times clock stability starts to become a problem as during correlation transmitter and receiver must
+maintain close phase alignment with respect to one chip period. A phase difference requirement of less than
+\SI{10}{\degree} over this period of time would translate into clock stability better than \SI{10}{ppm}. Though certainly
+not impossible to achieve this does pose an engineering challenge.
+
+A way to reduce clock alignment requirements might be to use grid frequency itself as a reference. Instead of keying
+the DSSS modulator/demodulator on a local crystal oscillator, chip timings would be described in fractions of a mains
+voltage cycle. This would track grid frequency variations synchronously at both ends and would maintain phase alignment
+even over long periods of time at the cost of a slight increase in system complexity. The receiver would then measure
+differences between consecutive chips instead of their absolute values.
+
+\section{Technical standardization}
+
+The description of a safety reset system provided in this work could be translated into a formalized technical standard.
+Our system is simple compared to e.g.\ a full smart meter communication standard and thus can conceivably be
+described in a single, concise document. The complicated side of standardization would be the standardization of the
+backend operation including key management, coördination and command authorization.
+
+\section{Regulatory adoption}
+\label{sec-regulation}
+
+Since the proposed system adds significant cost and development overhead at no immediate benefit to either consumer or
+utility company it is unlikely that it would be adopted voluntarily. Market forces limit what long-term planning utility
+companies can do. An advanced mitigation such as this one might be out of their reach on their own and might require
+regulatory intervention to be implemented. To regulatory authorities a system such as this one provides a primitive to
+guard against attacks. Due to the low-level approach our system might allow a regulatory authority to restore meters to
+a safe state without the need of fine-grained control of implementation details such as application network protocols.
+
+A regulatory authority might specify that all smart meters must use a standardized reset controller that on command
+resets to a minimal firmware image that disables external communication, continues basic billing functions and enables
+any disconnect switches. This system would enable the regulatory authority to directly preempt a large-scale attack
+irrespective of implementation details of the various smart meter implementations.
+
+Cryptographic key management for the safety reset system is not much different to the management of highly privileged
+signing keys as they are used in many other systems such as TLS already. If the safety reset system is implemented by a
+regulatory authority they would likely be able to find a public entity that is already managing root keys for other
+government systems to also manage safety reset keys. Availability and security requirements of safety reset keys do not
+differ significantly from those for other types of root keys.
+
+\section{Zones of trust}
+
+In our design, we opted for a safety reset controller in the form of a separate microcontroller entirely separate from
+whatever application microcontroller the smart meter design is already using. This design nicely separates the meter
+into an untrusted application on the core microcontroller and the trusted reset controller. Since the interface between
+the two is simple and one-way, it can be validated to a high standard of security.
+
+Despite these security benefits, the cost of such a separate hardware device might prove high in a mass-market rollout.
+In this case, one might attempt to integrate the reset controller into the core microcontroller in some way. Primarily,
+there would be two ways to accomplish this. One is a solution that physically integrates an additional microcontroller
+core into the main application microcontroller package either as a module on the same die or as a separate die in a
+multi-chip module (MCM) with the main application microcontroller. A custom solution integrating both on a single die
+might be a viable path for very large-scale deployments but will most likely be too expensive in tooling costs alone to
+justify its use. More likely for a medium- to large-scale deployment of millions of meters would be an MCM integrating an
+off-the-shelf smart metering microcontroller die with the reset controller running on another, much smaller
+off-the-shelf microcontroller die. This solution might potentially save some cost compared to a solution using a
+discrete microcontroller for the reset controller.
+
+The more likely approach to reducing cost overhead of the reset controller would be to employ virtualization
+technologies such as ARM's TrustZone in order to incorporate the reset controller firmware into the application firmware
+on the same processor core without compromising the reset controller's security or disturbing the application firmware's
+operation.
+
+TrustZone is a virtualization technology that provides a hardware-assisted privileged execution domain. In traditional
+virtualization setups a privileged hypervisor is managing several unprivileged applications that share resources between
+them. Separation between applications in this setup is longitudinal between adjacent virtual machines. Two applications
+would both be running in unprivileged mode sharing the same CPU and the hypervisor would merely schedule them, configure
+hardware resource access and coördinate communication. This longitudinal virtualization simplifies application
+development since from the application's perspective the virtual machine looks very similar to a physical one. In
+addition, in general this setup can be used to reciprocally isolate two applications with neither one being able to gain
+control over the other.
+
+In contrast to this, a TrustZone-like system in general does not provide several application virtual machines and
+longitudinal separation. Instead, it provides lateral separation between two domains: The unprivileged application
+firmware and a privileged hypervisor. Application firmware may communicate with the hypervisor through defined
+interfaces but due to TrustZone's design it need not even be aware of the hypervisor's existence. This makes a perfect
+fit for our reset controller. The reset controller firmware would be running in privileged mode and without exposing any
+communication interfaces to application firmware. The application firmware would be running in unprivileged mode
+without any modification. The main hurdles to the implementation of a system like this are the requirement for a
+microcontroller providing this type of virtualization on the one hand and the complexity of correctly employing this
+virtualization on the other hand. Virtualization systems such as TrustZone are still orders of magnitude more complex to
+correctly configure than it is to simply use separate hardware and secure the interfaces in between.
+
+\chapter{Conclusion}
+
+In this thesis we have developed an end-to-end design of a reset system to restore smart meters to a safe operating
+state during an ongoing large-scale cyberattack. We have laid out the fundamentals of smart metering infrastructure and
+elaborated the need for an out-of-band method to reset a meter's firmware due to the large attack surface of this
+complex firmware. To allow our system to be triggered even in the middle of a cyberattack we have developed a broadcast
+data transmission system based on intentional modulation of the global grid frequency. We have developed the theoretical
+foundations of the process based on an established model of inertial grid frequency response to load variations and
+shown the viability of our end-to-end design through extensive simulations. To put these simulations on a solid
+foundation we have developed a grid frequency measurement methodology consisting of a custom-designed hardware device
+for electrically safe data capture and a set of software tools to archive and process captured data. Our simulations
+show good behavior of our broadcast communication system and give an indication that coöperating with a large consumer
+such as an aluminum smelter would be a feasible way to set up a transmitter with very low hardware overhead. Based on
+our broadcast primitive we have developed a cryptographic protocol ready for embedded implementation in
+resource-constrained systems that allows triggering all or a selected subset of devices within a quick response time of
+less than 30 minutes. Finally, we have experimentally validated our system using simulated grid frequency data in a
+demonstrator setup based on a commercial microcontroller as our safety reset controller and an off-the-shelf smart
+meter. We have laid out a path for further research and standardization related to our system. Our code and electronics
+designs are available at the public repository listed on the second page of this document.
+
+\newpage
+
+%\nocite{*} TODO: check unused references
+\printbibliography[heading=bibintoc]
+\newpage
+
+\appendix
+
+\chapter{Frequency sensor schematics}
+\label{sec-app-freq-sens-schematics}
+\fancyhead[C]{Frequency sensor schematics (1/3)}
+\fancyfoot[C]{}
+\fancyhead[R]{\thepage}
+\includepdf[fitpaper,landscape,pagecommand={\thispagestyle{fancy}}]{resources/platform-export-pg1.pdf}
+\fancyhead[C]{Frequency sensor schematics (2/3)}
+\includepdf[fitpaper,pagecommand={\thispagestyle{fancy}}]{resources/platform-export-pg2.pdf}
+\fancyhead[C]{Frequency sensor schematics (3/3)}
+\includepdf[fitpaper,landscape,pagecommand={\thispagestyle{fancy}}]{resources/platform-export-pg3.pdf}
+\fancyfoot[C]{\thepage}
+
+\chapter{Demonstrator firmware symbol size map}
+\emph{Please find this appendix enclosed in the pouch on the inside of the back cover.}
+\label{symbol_size_chart}
+\includepdf[fitpaper]{resources/safetyreset-symbol-sizes.pdf}
+
+\ifdefined\includenotebooks
+\chapter{Transcripts of Jupyter notebooks used in this thesis}
+
+\includenotebook{Grid frequency estimation}{grid_freq_estimation}
+\includenotebook{Grid frequency estimation validation against ROCOF test suite}{freq_meas_validation_rocof_testsuite}
+\includenotebook{Frequency sensor clock stability analysis}{gps_clock_jitter_analysis}
+\includenotebook{DSSS modulation experiments}{dsss_experiments-ber}
+\fi
+
+\ifdefined\includefirmwaresources
+\chapter{Firmware source code excerpts}
+\section{DMA-backed ADC capture (adc.c)}
+\inputminted[fontsize=\footnotesize,linenos,firstline=18,lastline=115,breaklines]{C}{../gm_platform/fw/adc.c}
+
+\section{Frequency sensor packetized serial interface}
+\subsection{serial.c}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../gm_platform/fw/serial.c}
+\subsection{packet\_interface.c}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../gm_platform/fw/packet_interface.c}
+\subsection{cobs.c}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../gm_platform/fw/cobs.c}
+\subsection{Host data logging utility (tw\_test.py)}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{python}{../gm_platform/fw/tw_test.py}
+
+\section{Frequency estimation (freq\_meas.c)}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../controller/fw/src/freq_meas.c}
+\section{DSSS demodulation (dsss\_demod.c)}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../controller/fw/src/dsss_demod.c}
+\section{Cryptographic protocol handling}
+\subsection{protocol.c}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../controller/fw/src/protocol.c}
+\subsection{crypto.c}
+\inputminted[fontsize=\footnotesize,linenos,breaklines]{C}{../controller/fw/src/crypto.c}
+\fi
+
+
+% TODO
+%\chapter{Economic viability of countermeasures}
+%\section{Attack cost}
+%\section{Countermeasure cost}
+%\section{Conclusion}
+
+\end{document}