summaryrefslogtreecommitdiff
path: root/paper/safety-reset-paper.tex
blob: 640f84ae1d8bba8ab45e955dc59a821936b091bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
\documentclass[runningheads]{llncs}
\usepackage[T1]{fontenc}
\usepackage[
    backend=biber,
    style=numeric,
    natbib=true,
    url=false, 
    doi=true,
    eprint=false
    ]{biblatex}
\addbibresource{safety-reset.bib}
\usepackage{amssymb,amsmath}
\usepackage{eurosym}
\usepackage{wasysym}

\usepackage[binary-units]{siunitx}
\DeclareSIUnit{\baud}{Bd}
\DeclareSIUnit{\year}{a}
\usepackage{commath}
\usepackage{graphicx,color}
\usepackage{subcaption}
\usepackage{array}
\usepackage{hyperref}

\renewcommand{\floatpagefraction}{.8}
\newcommand{\degree}{\ensuremath{^\circ}}
\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}}
\newcommand{\partnum}[1]{\texttt{#1}}

\begin{document}

\title{Ripples in a Pond: Transmitting Information through Grid Frequency Modulation}
\titlerunning{Ripples in a Pond: Transmitting Information through Grid Frequency Modulation}
\author{Jan Sebastian Götte \and Björn Scheuermann}
\authorrunning{Jan Sebastian Götte \and Björn Scheuermann}
\institute{HIIG\\ \email{safetyreset@jaseg.de} \and HU Berlin \\ \email{scheuermann@informatik.hu-berlin.de}}
% FIXME keywords
\maketitle
\keywords{Security, privacy and resilience in critical infrastructures \and Security and privacy in ``internet of
things'' \and Cyber-physical systems \and Hardware security \and Network Security \and Energy systems \and Signal theory}

\begin{abstract}
    The smart grid is a large, complex and interconnected technological system. With remotely controllable load switches
    having been rolled out at scale in some countries, a tiny flaw inside the firmware of one of these embedded devices
    may allow attacks to remotely trigger large-scale excursions of grid parameters with potentially catastrophic
    results. Attaining perfect security from such cyberphysical attacks is a monumental embedded engineering task---and
    observations do not indicate that current efforts meet the requirements of this task.%FIXME cite recent RECESSIM work

    In this paper, we approach the smart grid safety issue by implementing an emergency override that can be used to
    e.g.\ reset all connected devices to a known-good state and preempting subsequent compromise by cutting
    communication links. To yield a fully fail-safe design, our system does not rely on the internet or any other
    communication network to work. Instead, our system transmits error-corrected and cryptographically secured commands
    by modulating grid frequency using a single large consumer such as a large aluminium smelter. This approach differs
    from traditional Powerline Communication (PLC) systems in that reaches every device within the same synchronous
    area.

    Using extensive simulations we have determined that control of a $\SI{25}{\mega\watt}$ load would allow for the
    transmission of a crytographically secured \emph{reset} signal within $15$ minutes. We have produced a
    proof-of-concept prototype receiver that demonstrates the feasibility of decoding such signals even on 
    resource-constrained microcontroller hardware.
\end{abstract}

\section{Introduction}

In the power grid, as in many other engineered systems, we can observe an ongoing diffusion of information systems into
industrial control systems. Automation of these control systems has already been practiced for the better part of a
century.  Throughout the 20th century this automation was mostly limited to core components of the grid. Generators in
power stations are computer-controlled according to electromechanical and economic models. Switching in substations is
automated to allow for fast failure recovery. Human operators are still vital to these systems, but their tasks have
shifted from pure operation to engineering, maintenance and surveillance\cite{crastan03,anderson02}.

With the turn of the century came a large-scale trend in power systems to move from a model of centralized generation,
built around massive large-scale fossil and nuclear power plants, towards a more heterogenous model of smaller-scale
generators working together. In this new model large-scale fossil power plants still serve a major role, but two new
factors come into play. One is the advance of renewable energies. The large-scale use of wind and solar power in
particular from a current standpoint seems unavoidable for our continued existence on this planet. For the electrical
grid these systems constitute a significant challenge. Fossil-fueled power plants can be controlled in a precise and
quick way to match energy consumption. This tracking of consumption with production is vital to the stability of the
grid. Renewable energies such as wind and solar power do not provide the same degree of controllability, and they
introduce a larger degree of uncertainty due to the unpredictability of the forces of nature\cite{crastan03}.

Along with this change in dynamic behavior, renewable energies have brought forth the advance of distributed generation.
In distributed generation end-customers that previously only consumed energy have started to feed energy into the grid
from small solar installations on their property. Distributed generation is a chance for customers to gain autonomy and
shift from a purely passive role to being active participants of the electricity market\cite{crastan03}.

To match this new landscape of decentralized generation and unpredictable renewable resources the utility industry has
had to adapt itself in major ways. One aspect of this adaptation that is particularly visible to ordinary people is the
computerization of end-user energy metering. Despite the widespread use of industrial control systems inside the
electrical grid and the far-reaching diffusion of computers into people's everyday lives the energy meter has long been
one of the last remnants of an offline, analog time. Until the 2010s many households were still served through
electromechanical Ferraris-style meters that have their origin in the late 19th
century\cite{borlase01,ukgov04,bnetza02}.  Today under the umbrella term \emph{Smart Metering} the shift towards fully
computerized, often networked meters is well underway. The roll out of these \emph{Smart Meters} has not been very
smooth overall with some countries severely lagging behind. As a safety-critical technology, smart metering technology
is usually standardized on a per-country basis. This leads to an inhomogenous landscape with--in some instances--wildly
incompatible systems.  Often vendors only serve a single country or have separate models of a meter for each country.
This complex standardization landscape and market situation has led to a proliferation of highly complex, custom-coded
microcontroller firmware. The complexity and scale of this--often network-connected--firmware makes for a ripe substrate
for bugs to surface.

A remotely exploitable flaw inside the firmware of a component of a smart metering ystem could have consequences ranging
from impaired billing functionality to an existential threat to grid stability\cite{anderson01,anderson02}. In a country
where meters commonly include disconnect switches for purposes such as prepaid tariffs a coördinated attack could at
worst cause widespread activation of grid safety systems by repeatedly connecting and disconnecting megawatts of load
capacity in just the wrong moments\cite{wu01}.

Mitigation of these attacks through firmware security measures is unlikely to yield satisfactory results. The enormous
complexity of smart meter firmware makes firmware security extremely labor-intensive. The diverse standardization
landscape makes a coördinated, comprehensive response unlikely.

In this paper, instead of focusing on the very hard task of improving firmware security we introduce a pragmatic
solution to the--in our opinion likely--scenario of a large-scale compromise of smart meter firmware. In our proposal
the components of the smart meter that are threatened by remote compromise are equipped with a physically separate
\emph{safety reset controller} that listens for a reset command transmitted through the electrical grid's frequency and
on reception forcibly resets the smart meter's entire firmware to a known-good state.  Our safety reset controller
receives commands through Direct Sequence Spread Spectrum (DSSS) modulation carried out on grid frequency through a
large controllable load such as an aluminum smelter. After forward error correction and cryptographic verification it
re-flashes the meter's main microcontroller over the standard JTAG interface.  Note that our modulation technique is one
\emph{changing grid frequency itself}.  This is fundamentally different in both generation and detection from systems
such as traditional PLC that superimpose a signal on grid voltage, but leave grid frequency itself unaffected.

Starting from a high level architecture, we have carried out extensive simulations of our proposal's performance under
real-world conditions. Based on these simulations we implemented an end-to-end prototype of our proposed safety reset
controller as part of a realistic smart meter demonstrator. Finally, we experimentally validated our results and we will
conclude with an outline of further steps towards a practical implementation.

This work contains the following contributions:
\begin{enumerate}
    \item We introduce Grid Frequency Modulation (GFM) as a communication primitive. % FIXME done before in that one paper
    \item We elaborate the fundamental physics underlying GFM and theorize on the constrains of a practical
        implementation.
    \item We design a communication system based on GFM.
    \item We carry out extensive simulations of our systems to determine its performance characteristics.
    \item We show the simple grid frequency recorder design we used to capture data for our simulations.
\end{enumerate}

\section{Related work} 
\label{sec_related_work}
% FIXME: Cut down this section from ~6 pages to 2...3 pages.

\subsection{Security and Privacy in the Smart Grid}

The smart grid in practice is nothing more or less than an aggregation of embedded control and measurement devices that
are part of a large control system. This implies that all the same security concerns that apply to embedded systems in
general also apply to most components of a smart grid. Where programmers have been struggling for decades now with input
validation\cite{leveson01}, the same potential issue raises security concerns in smart grid scenarios as well\cite{mo01,
lee01}.  Only, in smart grid we have two complicating factors present: Many components are embedded systems, and as such
inherently hard to update. Also, the smart grid and its control algorithms act as a large (partially-)distributed
system making problems such as input validation or authentication harder\cite{blaze01} and adding a host of distributed
systems problems on top\cite{lamport01}.

Given that the electrical grid is essential infrastructure in our modern civilization, these problems amount to
significant issues in practice. Attacks on the electrical grid may have grave consequences\cite{anderson01,lee01} while
the long maintenance cycles of various components make the system slow to adapt. Thus, components for the smart grid
need to be built to a much higher standard of security than most consumer devices to ensure they live up to well-funded
attackers even decades down the road. This requirement intensifies the challenges of embedded security and distributed
systems security among others that are inherent in any modern complex technological system. The safety-critical nature
of the modern smart metering ecosystem in particular was quickly recognized\cite{anderson01}.

A point we will not consider in much depth in this work is theft of electricity. While in publications aimed towards the
general public the introduction of smart metering is always motivated with potential cost savings and ecological
benefits, in industry-internal publications the reduction of electricity theft is often cited as an
incentive\cite{czechowski01}.  Likewise, academic publications tend to either focus on other benefits such as generation
efficiency gains through better forecasting or rationalize the consumer-unfriendly aspects of smart metering with social
benefits\cite{mcdaniel01}. They do not usually point out \emph{revenue protection} mechanisms as
incentives\cite{anderson01,anderson02}.

A serious issue in smart metering setups is customer privacy. Even though the meter ``only'' collects aggregate energy
consumption of a whole household, this data is highly sensitive\cite{markham01}. This counterintuitive fact was
initially overlooked in smart meter deployments leading to outrage, delays and reduced features\cite{cuijpers01}. The
root cause of this problem is that given sufficient timing resolution these aggregate measurements contain ample
entropy. Through disaggregation algorithms individual loads can be identified and through pattern matching even complex
usage patterns can be discerned with alarming accuracy\cite{greveler01}. Similar privacy issues arise in many other
areas of modern life through pervasive tracking and surveillance\cite{zuboff01}.

Another fundamental challenge in smart grid implementations is the central role of smart electricity meters in the smart
grid ecosystem. Smart meters are used both for highly-granular load measurement and (in some countries) load
switching\cite{zheng01}.  Smart electricity meters are effectively consumer devices. They are built down to a certain
price point that is measured by the burden it puts on consumers and that is divided by the relatively small market
served by a single smart meter implementation. Such cost requirements can preclude security features such as the use of
a standard hardened software environment on a high powered embedded system.  Landis+Gyr, a large manufacturer that makes
most of its revenue from utility meters in their 2019 annual report write that they \SI{36}{\percent} of their total
R\&D budget on embedded software (firmware) while spending only \SI{24}{\percent} on hardware
R\&D\cite{landisgyr01,landisgyr02}, indicating a significant tension between firmware security and a smart meter
vendor's bottom line. 

\subsection{The state of the art in embedded security}

Embedded software security generally is much harder than security of higher-level systems. The primary two factors
affecting this are that on one hand, embedded devices usually run highly customized firmware that (often by necessity)
is rarely updated. On the other hand, embedded devices often lack the advanced security mechanisms such as memory
management units that are found in most higher-power devices. Even well-funded companies continue to have trouble
securing their embedded systems. A spectacular example of this difficulty is the recently-exposed flaw in Apple's iPhone
SoC first-stage ROM bootloader that allows for the full compromise of any iPhone before the iPhone X given physical
access to the device. iPhone 8, one of the affected models, was still being manufactured and sold by Apple until April
2020.  In another instance in 2016 researchers found multiple flaws in the secure-world firmware used by Samsung in
their mobile phone SoCs.  The flaws they found were both severe architectural flaws such as secret user input being
passed through untrusted userspace processes without any protection and shocking cryptographic flaws such as
CVE-2016-1919\footnote{\url{http://cve.circl.lu/cve/CVE-2016-1919}}\cite{kanonov01}.  And Samsung is not the only large
multinational corporation having trouble securing their secure world firmware implementation. In 2014 researchers found
an embarrassing integer overflow flaw in the low-level code handling untrusted input in Qualcomm's QSEE
firmware\cite{rosenberg01}. For an overview of ARM TrustZone including a survey of academic work and past security
vulnerabilities of TrustZone-based firmware see \cite{pinto01}.

If even companies targeting R\&D budgets that rival some countries' national budgets at mass-market consumer devices
have trouble securing their secure embedded software stacks, what is a much smaller smart meter manufacturer to do?
Especially if national standards mandate complex protocols such as TLS that are tricky to implement
correctly\cite{georgiev01}, the manufacturer is short on options to secure their product.

\subsection{Attack surface in the smart grid}

From the previous paragraphs we can conclude that in smart metering technology, market incentives do not currently
provide the conditions for a level of device security that will reliably last the coming decades. Considering this
tension, in this paragraph we will outline the cyberphysical risks that arise from attacks on the smart grid in the
first place.

The first such attack that might come to mind is one where the attacker compromises components of the grids centralized
control systems. This type of attack is often cited in popular discourse and to our knowledge is the only type of attack
against a grid that has ever been carried out in practice at scale. Despite their severity, these attacks do not pose a
strictly \emph{scientific} challenge, though since these attacks are generic to any industrial control system. Their
causes and countermeasures are generally well-understood and the hardest challenge in their prevention is likely to lie
in budgetary constraints.

Beyond the centralized control systems, the next target for an attacker may be the communication links between those
control systems and other smart grid components. While in older systems as well as the last mile to households' smart
meters special-purpose systems such as PLC are still common, in the overall system IP-based technologies have
proliferated much like they did in other industries. Along with this adoption of IP-based communication links comes the
ability to apply generic network security measures from the IP world to the smart grid domain. In this way, a
standardized, IP-based protocol stack unlocks decades of network security improvements at little cost. 

Finally, an attacker might target the endpoint device itself. Smart meters are deployed at a large scale
%%% FIXME << HERE WIP >>

\subsection{Cyberphysical threats in the smart grid}

If we model the smart grid as a control system responding to changes in inputs by regulating outputs, on a
very high level we can see two general categories of attacks: Attacks that directly change the state of the outputs, and
attacks that try to influence the outputs indirectly by changing the system's view of its inputs. The former would be an
attack such as shutting down a power plant to decrease generation capacity\cite{lee01}. The latter would be an attack
such as forging grid frequency measurements where they enter a power plant's control systems to provoke the control
systems to
oscillate\cite{kosut01,wu01,kim01}.


\paragraph{Control function exploits.}
Control function exploits are attacks on the mathematical control loops used by the centralized control system. One
example of this type of attack are resonance attacks as described in \cite{wu01}.  In this kind of attack, inputs from
peripheral sensors indicating grid load to the centralized control system are carefully modified to cause a
disproportionately large oscillation in control system action. This type of attack relies on complex resonance effects
that arise when mechanical generators are electrically coupled. These resonances, colloquially called ``modes'', are
well-studied in power system engineering\cite{rogers01,grebe01,entsoe01,crastan03}.  Even disregarding modern attack
scenarios, for stability electrical grids are designed with measures in place to dampen any resonances inherent to grid
structure. These resonances are hard to analyze since they require an accurate grid model and they are unlikely to be
noticed under normal operating conditions.

Mitigation of these attacks can be achieved by ensuring unmodified sensor inputs to the control systems in the first
place. Carefully designing control systems not to exhibit exploitable behavior such as oscillations is also possible but
harder.

\paragraph{Endpoint exploits.}
The one to us rather interesting attack on smart grid systems is someone exploiting the grid's endpoint devices such as
smart electricity meters.  These meters are deployed on a massive scale, with at least one meter per household on
average\footnote{Households rarely share a meter but some households may have a separate meter for detached properties
such as a detached garage or basement.}.  Once compromised, restoration to an uncompromised state can be difficult if it
requires physical access to thousands of devices in hard-to-access locations.

By compromising smart electricity meters, an attacker can forge the distributed energy measurements these devices
perform. In a best-case scenario, this might only affect billing and lead to customers being under- or over-charged if
the attack is not noticed in time. In a less ideal scenario falsified energy measurements reported by these devices
could impede the correct operation of centralized control systems.

In some countries such as the UK smart meters have one additional function that is highly useful to an attacker: They
contain high-current disconnect switches to disconnect the entire household or business in case electricity bills are
left unpaid for a certain period. In countries that use these kinds of systems on a widespread level, the load
disconnect switch is controlled by the smart meter's central microcontroller. This allows anyone compromising this
microcontroller's firmware to actuate the disconnect switch at will.  Given control over a large number of
network-connected smart meters, an attacker might thus be able to cause large-scale disruptions of power
consumption\cite{anderson01,temple01}.  Combined with an attack method such as the resonance attack from \cite{wu01}
that was mentioned above, this scenario poses a serious threat to grid stability.

In places where Demand-Side Management (DSM) is common this functionality may be abused in a similar way. In DSM the
smart metering system directly controls power to certain devices such as heaters. The utility can remotely control the
turn-on and turn-off of these devices to smoothen out the load curve. In exchange the customer is billed a lower price
for the energy consumed by these loads. DSM was traditionally done in a federated fashion usually through low-frequency
PLC over the distribution grid\cite{dzung01}. Smart metering systems no longer require large, resource-intensive
transmitters in substations and bear the potential for a rollout of such technology on a much wider scale than before.
This leads to a potentially significant role of DSM systems in the impact calculation of an attack on a smart metering
system. DSM does not control as much load capacity as remote disconnect switches do but the attacks cited in the above
paragraph still fundamentally apply.

\subsection{Communication Channels on the Grid}

There is a number of well-established technologies for communication on or along power lines. We can distinguish three
basic system categories: Systems using separate wires (such as DSL over landline telephone wiring), wireless radio
systems (such as LTE) and \emph{power line communication} (PLC) systems that reüse the existing mains wiring and
superimpose data transmissions onto the 50 Hz mains sine\cite{gungor01,kabalci01}.

For our scenario, we will ignore short-range communication systems. There exists a large number of \emph{wideband}
power line communication systems that are popular with consumers for bridging Ethernet segments between parts of an
apartment or house.  These systems transmit up to several hundred megabits per second over distances up to several tens
of meters\cite{kabalci01}.  Technologically, these wideband PLC systems are very different from \emph{narrowband}
systems used by utilities for load management among other applications and they are not relevant to our analysis.

\paragraph{Power line communication (PLC).}
In long-distance communications for applications such as load management, PLC systems are attractive since they allow
re-using the existing wiring infrastructure and have been used as early as in the 1930s\cite{hovi01}. Narrowband PLC
systems are a potentially low-cost solution to the problem of transmitting data at small bandwidth over distances of
several hundred meters up to tens of kilometers.

Narrowband PLC systems transmit on the order of Kilobits per second or slower.  A common use of this sort of system are
\emph{ripple control} systems. These systems superimpose a low-frequency signal at some few hundred Hertz carrier
frequency on top of the 50Hz mains sine. This low-frequency signal is used to encode switching commands for
non-essential residential or industrial loads. Ripple control systems provide utilities with the ability to actively
control demand while promising savings in electricity cost to consumers\cite{dzung01}.

In any PLC system there is a strict trade-off between bandwidth, power and distance. Higher bandwidth requires higher
power and reduces maximum transmission distance. Where ripple control systems usually use few transmitters to cover
the entire grid of a regional distribution utility, higher bandwidth bidirectional systems used for automatic meter
reading (AMR) in places such as Italy or France require repeaters within a few hundred meters of a transmitter.

\subsubsection{Landline and wireless IP-based systems.}
Especially in automated meter reading (AMR) infrastructure the cost-benefit trade-off of power line systems does not
always work out for utilities. A common alternative in these systems is to use the public internet for communication.
Using the public internet has the advantage of low initial investment on the part of the utility company as well as
quick commissioning. Disadvantages compared to a PLC system are potentially higher operational costs due to recurring
fees to network providers as well as lower reliability. Being integrated into power grid infrastructure, a PLC system's
failure modes are highly correlated with the overall grid. Put briefly, if the PLC interface is down, there is a good
chance that power is out, too. In contrast general internet services exhibit a multitude of failures that are entirely
uncorrelated to power grid stability.  For purposes such as meter reading for billing purposes, this stability is
sufficient. However for systems that need to hold up in crisis situations such as the recovery system we are
contemplating in this thesis, the public internet may not provide sufficient reliability.

\subsubsection{Short-range wireless systems.}
Smart meters contain copious amounts of firmware but still pale in comparison to the complexity of full-scale computers
such as smartphones. For short-range communication between a meter and a cellular radio gateway mounted nearby or
between a meter and a meter reading operator in a vehicle on the street a protocol such as Wifi (IEEE 802.11) is too
complex. Absent widely-used standards in this space proprietary radio protocols grew attractive. These are often based
on some standardized lower-level protocol such as ZigBee (IEEE 802.15) but entirely home-grown ones also exist. To the
meter manufacturer a proprietary radio protocol has several advantages. It is easy to implement and requires no external
certification. It can be customized to its specific application. In addition it provides vendor lock-in to customers
sharing infrastructure such as a cellular radio gateway between multiple devices.  In other fields a lack of
standardization has led to a proliferation of proprietary protocols and a fragmented protocol landscape. This is a large
problem since the consumer cannot easily integrate products made by different manufacturers into one system. In advanced
metering infrastructure this is unlikely to be a disadvantage since usually there is only one distribution grid
operator for an area.  Shared resources such as a cellular radio gateway would most likely only be shared within a
single building and usually they are all operated by the same provider.

Systems in Europe commonly support Wireless M-Bus, an European standardized protocol\cite{silabs01} that operates on
several ISM bands\footnote{
    Frequency bands that can be used for \emph{Industrial, Scientific and Medical} applications by anyone and that do
    not require obtaining a license for transmitter operation. Manufacturers can use whatever protocol they like on
    these bands as long as they obtain certification that their transmitters obey certain spectral and power
    limitations.
}. ZigBee is another popular standard and some vendors additionally support their own proprietary protcols\footnote{
    For an example see \cite{honeywell01}.
}.

\section{Grid Frequency as a Communication Channel}

Despite the awesome complexity of large power grids the physics underlying their response to changes in load and
generation is surprisingly simple. Individual machines (loads and generators) can be approximated by a small number of
differential equations and the entire grid can be modelled by aggregating these approximations into a large system of
nonlinear differential equations. Evaluating these systems it has been found that in large power grids small signal
steady state changes in generation/consumption power balance cause an approximately linear change in
frequency\cite{kundur01,crastan03,entsoe02,entsoe04}. \emph{Small signal} here describes changes in power balance that
are small compared to overall grid power.  \emph{Steady state} describes changes over a time frame of multiple waveform
cycles as opposed to transient events that only last a few milliseconds.

This approximately linear relationship allows the specification of a coefficient with unit \si{\watt\per\hertz} linking
power differential $\Delta P$ and frequency differential $\Delta f$.  In this thesis we are using the European power
grid as our model system. We are using data provided by ENTSO-E (formerly UCTE), the governing association of European
transmission system operators. In our calculations we use data for the continental European synchronous area, the
largest synchronous area. $\frac{\Delta P}{\Delta f}$, called \emph{Overall Network Power Frequency Characteristic} by
ENTSO-E is around \SI{25}{\giga\watt\per\hertz}.

We can derive general design parameter for any system utilizing grid frequency as a communication channel from the
policies of ENTSO-E\cite{entsoe02,entsoe03}.  Any such system should stay below a modulation amplitude of
\SI{100}{\milli\hertz} which is the threshold defined in the ENTSO-E incidents classification scale for a Scale 0-1
(from ``Anomaly'' to ``Noteworthy Incident'' scale) frequency degradation incident\cite{entsoe02} in the continental
Europe synchronous area.
% FIXME resolve cut --- 

Grid frequency in Europe's synchronous areas is nominally 50 Hertz, but there are small load-dependent variations from
this nominal value. Any device connected to the power grid (or even just within physical proximity of power wiring) can
reliably and accurately measure grid frequency at low hardware overhead. By intentionally modifying grid frequency, we
can create a very low-bandwidth broadcast communication channel. Grid frequency modulation has only ever been proposed
as a communication channel at very small scales in microgrids before\cite{urtasun01} and to our knowledge has not yet
been considered for large-scale application.

Advantages of using grid frequency for communication are low receiver hardware complexity as well as the fact that a
single transmitter can cover an entire synchronous area. Though the transmitter has to be very large and powerful the
setup of a single large transmitter faces lower bureaucratic hurdles than integration of hundreds of smaller ones into
hundreds of local systems that each have autonomous governance.

% FIXME resolve cut ---
\subsection{Interference from Frequency-Coupled Control Systems}

The ENTSO-E Operations Handbook Policy 1 chapter\cite{entsoe02} defines the activation threshold of primary control to
be \SI{20}{\milli\hertz}. Ideally, a modulation system would stay well below this threshold to avoid fighting the
primary control reserve. Modulation line rate should likely be on the order of a few hundred Millibaud.  Modulation at
these rates would outpace primary control action which is specified by ENTSO-E as acting within between ``a few
seconds'' and \SI{15}{\second}.

Keeping modulation amplitude below this threshold would help to avoid spuriously triggering these control functions.
The effective \emph{Network Power Frequency Characteristic} of primary control in the European grid is reported by
ENTSO-E at around \SI{20}{\giga\watt\per\hertz}.   This works out to an upper bound on modulation power of
\SI{20}{\mega\watt\per\milli\hertz}.


\subsection{Transmission Grid Fundamentals for Computer Scientists}
\subsection{Determining Grid Frequency}

% FIXME resolve cut ---
In commercial power systems Phasor Measurement Units (PMUs, also called \emph{synchrophasors}) are used to precisely
measure parameters of the mains voltage waveform, one of which is grid frequency. PMUs are used as part of SCADA systems
controlling transmission networks to characterize the operational state of the network.

From a superficial viewpoint measuring grid frequency might seem like a simple problem. Take the mains voltage waveform,
measure time between two rising-edge (or falling-edge) zero-crossings and take the inverse $f = t^{-1}$. In practice,
phasor measurement units are significantly more complex than this. This discrepancy is due to the combination of both
high precision and quick response that is demanded from these units. High precision is necessary since variations of
mains frequency under normal operating conditions are quite small--in the range of \SIrange{5}{10}{\milli\hertz} over
short intervals of time. Relative to the nominal \SI{50}{\hertz} this is a derivation of less than \SI{100}{ppm}.
Relative to the corresponding period of \SI{20}{\milli\second} this means a time derivation of about $2 \mu\text{s}$
from cycle to cycle. From this it is already obvious why a simplistic measurement cannot yield the required precision
for manageable averaging times: We would need either an ADC sampling rate in the order of megabits per second or for a
reconstruction through interpolated readings an impractically high ADC resolution.

Detail on the inner workings of commercial phasor measurement units is scarce but given their essential role to SCADA
systems there is a large amount of academic research on such algorithms\cite{narduzzi01,derviskadic01,belega01}. A
popular approach to these systems is to perform a Short-Time Fourier Transform (STFT) on ADC data sampled at high
sampling rate (e.g. \SI{10}{\kilo\hertz}) and then perform analysis on the frequency-domain data to precisely locate the
peak at \SI{50}{\hertz}. A key observation here is that FFT bin size is going to be much larger than required frequency
resolution. This fundamental limitation follows from the Nyquist criterion\cite{shannon01}
and if we had to process an \emph{arbitrary} signal this would severely limit our practical measurement accuracy
\footnote{
    Some software packages providing FFT or STFT primitives such as scipy\cite{virtanen01} allow the user to
    super-sample FFT output by specifying an FFT width larger than input data length, padding the input data with zeros
    on both sides. Note that in line with the Nyquist theorem this \emph{does not} actually provide finer output
    resolution but instead just amounts to an interpolation between output bins. Depending on the downstream analysis
    algorithm it may still be sensible to use this property of the DFT for interpolation, but in general it will be
    computationally expensive compared to other interpolation methods and in any case it will not yield any better
    frequency resolution aside from a potential numerical advantage\cite{gasior02}.
}.
For this reason all approaches to grid frequency estimation are based on a model of the voltage waveform.  Nominally
this waveform is a perfect sine at $f=\SI{50}{\hertz}$. In practice it is a sine at $f\approx\SI{50}{\hertz}$
superimposed with some aperiodic noise (e.g. irregular spikes from inductive loads being energized) as well as harmonic
distortion that is caused by topologically nearby devices with power factor $\cos \theta \neq 1.0$. Under a continuous
fourier transform over a long period the frequency spectrum of a signal distorted like this will be a low noise floor
depending mainly on aperiodic noise on which a comb of harmonics as well as some sub-harmonics of $f \approx
f_\text{nom} = \SI{50}{\hertz}$ is riding. The main peak at $f \approx f_\text{nom}$ will be very strong with the
harmonics being approximately an order of magnitude weaker in energy and the noise floor being at least another order of
magnitude weaker. See Figure \ref{mains_voltage_spectrum} for a measured spectrum.  This domain knowledge about the
expected frequency spectrum of the signal can be employed in a number of interpolation techniques to reconstruct the
precise frequency of the spectrum's main component despite distortions and the comparatively coarse STFT resolution.

Published grid frequency estimation algorithms such as \cite{narduzzi01,derviskadic01} are rather sophisticated and use
a combination of techniques to reduce numerical errors in FFT calculation and peak fitting. Given that we do not need
reference standard-grade accuracy for our application we chose to start with a very basic algorithm instead. We chose to
use a general approach to estimate the precise fundamental frequency of an arbitrary signal that was published by
experimental physicists Gasior and Gonzalez at CERN\cite{gasior01}. This approach assumes a general sinusoidal signal
superimposed with harmonics and broadband noise.  Applicable to a wide spectrum of practical signal analysis tasks it is
a reasonable first-degree approximation of the much more sophisticated estimation algorithms developed specifically for
power systems.  Some algorithms use components such as kalman filters\cite{narduzzi01} that require a physical model.
As a general algorithm \cite{gasior01} does not require this kind of application-specific tuning, eliminating one source
of error.

The Gasior and Gonzalez algorithm\cite{gasior01} passes the windowed input signal through a DFT, then interpolates the
signal's fundamental frequency by fitting a wavelet such as a Gaussian to the largest peak in the DFT results. The bias
parameter of this curve fit is an accurate estimation of the signal's fundamental frequency. This algorithm is similar
to the simpler interpolated DFT algorithm used as a reference in much of the synchrophasor estimation
literature\cite{borkowski01}. The three-term variant of the maximum side lobe decay window often used there is a
Blackman window with parameter $\alpha = \frac{1}{4}$. Analysis has shown\cite{belega01} that the interpolated DFT
algorithm is worse than algorithms involving more complex models under some conditions but that there is \emph{no free
lunch} meaning that more complex perform worse when the input signal deviates from their models.
% FIXME resolve cut ---

\subsubsection{Our Algorithm}
\subsubsection{Our Hardware}

\section{Characteristics of Grid Frequency}

\section{Grid Frequency Modulation}
\subsection{Fundamental Physics}
\subsection{Transmitter Implementation}

% FIXME resolve cut ---
In its most basic form a transmitter for grid frequency modulation would be a very large controllable load connected to
the power grid at a suitable vantage point. A spool of wire submerged in a body of cooling liquid such as a small lake
along with a thyristor rectifier bank would likely suffice to perform this function during occasional cybersecurity
incidents.  We can however decrease hardware and maintenance investment even further compared to this rather
uncultivated solution by repurposing regular large industrial loads as transmitters in an emergency situation.  For some
preliminary exploration we went through a list of energy-intensive industries in Europe\cite{ec01}.  The most
electricity-intensive industries in this list are primary aluminum and steel production.  In primary production raw ore
is converted into raw metal for further refinement such as casting, rolling or extrusion.  In steelmaking iron is
smolten in an electric arc furnace. In aluminum smelting aluminum is electrolytically extracted from alumina. Both
processes involve large amounts of electricity with electricity making up \SI{40}{\percent} of production costs. Given
these circumstances a steel mill or aluminum smelter would be good candidates as transmitters in a grid frequency
modulation system.

In aluminum smelting high-voltage mains is transformed, rectified and fed into about 100 series-connected electrolytic
cells forming a \emph{potline}. Inside these pots alumina is dissolved in molten cryolite electrolyte at about
\SI{1000}{\degreeCelsius} and electrolysis is performed using a current of tens or hundreds of Kiloampère. The resulting
pure aluminum settles at the bottom of the cell and is tapped off for further processing.

Like steelworks, aluminum smelters are operated night and day without interruption. Aside from metallurgical issues the
large thermal mass and enormous heating power requirements do not permit power cycling. Due to the high costs of
production inefficiencies or interruptions the behavior of aluminum smelters under power outages is a
well-characterized phenomenon in the industry. The recent move away from nuclear power and towards renewable energy has
lead to an increase in fluctuations of electricity price throughout the day. These electricity price fluctuations have
provided enough economic incentive to aluminum smelters to develop techniques to modulate smelter power consumption
without affecting cell lifetime or product quality\cite{duessel01,eisma01}. Power outages of tens of minutes up to two
hours reportedly do not cause problems in aluminum potlines and are in fact part of routine operation for purposes such
as electrode changes\cite{eisma01,oye01}.

The power supply system of an aluminum plant is managed through a highly-integrated control system as keeping all cells
of a potline under optimal operating conditions is challenging. Modern power supply systems employ large banks of diodes
or SCRs\footnote{SCRs, also called thyristors, are electronic devices that are often used in high-power switching
applications. They are normally-off devices that act like diodes when a current is fed into their control terminal.} to
rectify low-voltage AC to DC to be fed into the potline\cite{ayoub01}. The potline voltage can be controlled almost
continuously through a combination of a tap changer and a transductor. The individual cell voltages can be controlled by
changing the anode to cathode distance (ACD) by physically lowering or raising the anode.  The potline power supply is
connected to the high voltage input and to the potline through isolators and breakers.

In an aluminum smelter most of the power is sunk into resistive losses and the electrolysis process. As such an
aluminum smelter does not have any significant electromechanical inertia compared to the large rotating machines used
in other industries. Depending on the capabilities of the rectifier controls high slew rates are possible, permitting
modulation at high\footnote{Aluminum smelter rectifiers are \emph{pulse rectifiers}. This means instead of simply
rectifying the incoming three-phase voltage they use a special configuration of transformer secondaries and in some
cases additional coils to produce a large number of equally spaced phases (e.g.\ six) from a standard three-phase input.
Where a direct-connected three-phase rectifier would draw current in six pulses per mains voltage cycle a pulse
rectifier draws current in more, smaller pulses to increase power factor. For example a 12-pulse rectifier will draw
current in 12 pulses per cycle. In the best case an SCR pulse rectifier switched at zero crossing should allow
\SIrange{0}{100}{\percent} load changes from one rectifier pulse to the next, i.e. within a fraction of a single cycle.}
data rates.
% FIXME resolve cut ---

\subsection{Parametrizing DSSS Modulation for GFM}

% FIXME resolve cut/write intro ---
\begin{description}
    \item[Modulation amplitude.] Amplitude is proportionally related to modulation power. In a practical setup we might
        realize a modulation power up to a few hundred \si{\mega\watt} which would yield a few tens of \si{\milli\hertz}
        of frequency amplitude.
    \item[Modulation preemphasis and slew-rate control.] Preemphasis might be necessary to ensure an adequate
        Signal-to-Noise ratio (SNR) at the receiver. Slew-rate control and other shaping measures might be necessary to
        reduce the impact of these sudden load changes on the transmitter's primary function (say, aluminum smelting)
        and to prevent disturbances to other grid components.
    \item[Modulation frequency.] For a practical implementation a careful study would be necessary to determine the
        optimal frequency band for operation. On one hand we need to prevent disturbances to the grid such as the
        excitation of local or inter-area modes. On the other hand we need to optimize Signal-to-Noise ratio (SNR)
        and data rate to achieve optimal latency between transmission start and reset completion and to reduce the
        overall burden on both transmitter and grid.
    \item[Further modulation parameters.] The modulation itself has numerous parameters that are discussed in Section
        \ref{mod_params} below.
\end{description}

% FIXME resolve cut/write intro ---
% FIXME too many enumerations?
In this section we will explore how we can construct a reliable communication channel from the analog primitive we
have outlined in the previous section. Our load control approach to grid frequency modulation leads to a channel with the
following properties.

\begin{description}
    \item[Slow-changing.] Accurate grid frequency measurements take several periods of the mains sine wave. Faster
        sampling rates can be achieved with more complex specialized synchrophasor estimation algorithms but this will
        result in a trade-off between sampling rate and accuracy\cite{belega01}.
    \item[Analog.] Grid frequency is an analog signal.
    \item[Noisy.] While stable over long periods of time thanks to power stations' Load-Frequency Control
        systems\cite{entsoe04} there are considerable random short-term variations. Our modulation amplitude is limited
        by technical and economic constraints so we have to find a system that will work at poor SNRs.
    \item[Polarized.] Grid frequency measurements have an inherent sense of polarity that we can use in our modulation
        scheme.
\end{description}

% FIXME resolve cut ---
Modern power systems are complex electromechanical systems. Each component is controlled by several carefully tuned
feedback loops to ensure voltage, load and frequency regulation. Multiple components are coupled through transmission
lines that themselves exhibit complex dynamic behavior. The overall system is generally stable, but may exhbit
instabilities to particular small-signal stimuli\cite{kundur01,crastan03}. These instabilities, called \emph{modes},
occur when due to mis-tuning of parameters or physical constraints the overall system exhibits oscillation at a
particular frequency.  \cite{kundur01} separates these modes into four categories:

\begin{description}
    \item[Local modes] where a single power station oscillates in some parameter,
    \item[Interarea modes] where subsections of the overall grid oscillate with respect to each other due to weak
        coupling between them,
    \item[Control modes] caused by imperfectly tuned control systems and
    \item[Torsional modes] that originate from electromechanical oscillations in the generator itself.
\end{description}

The oscillation frequencies associated with each of these modes are usually between a few tens of Millihertz and a few
Hertz\cite{grebe01,entsoe01,crastan03}. It is hard to predict the particular modes of a power system at the scale of the
central European interconnected system. Theoretical analysis and simulation may give rough indications but cannot yield
conclusive results. Due to the obvious danger as well as high economical impact due to inefficiencies experimental
measurements are infeasible. Modes are highly dependent on the power grid's structure and will change with changes in
the power grid over time. For all of these reasons, a grid frequency modulation system must be designed very
conservatively without relying on the absence (or presence) of modes at particular frequencies. A concrete design
guideline that we can derive from this situation is that the frequency spectrum of any grid frequency modulation system
should not exhibit large peaks and should avoid a concentration of spectral energy in small frequency bands.
% FIXME resolve cut ---

\subsection{Parametrizing a "Safety Reset" System Based on GFM}
% FIXME resolve cut & write intro ---
% FIXME cut down next 2 sections
\subsubsection{Error-correcting codes}

To reduce reception error rate we have to layer channel coding on top of the DSSS modulation. The messages we expect to
transmit are at least a few tens of bits long. We are highly constrained in SNR due to limited transmission power and
with lower SNR comes higher BER (Bit Error Rate). At a fixed BER, packet error rate grows exponentially with
transmission length so for our relatively long transmissions we would realistically get unacceptable error rates.

Error correcting codes are a very broad field with many options for specialization. Since we are implementing only an
advanced prototype in this thesis we chose to spend only limited resources on optimization and settled on a basic
Reed-Solomon code. We have no doubt that applying a more state-of-the-art code we could gain further improvements in
code overhead and decoding speed among others\cite{mackay01}. Since message length in our system limits system response
time but we do not have a fixed target we can tolerate some degree of overhead.  Decoding speed is of very low concern
to us because our data rate is extremely low.  We derived our implementation by adapting and optimizing an existing open
source decoder that we validated on an open source encoder implementation. We generate test signals using a Python tool
on the host.

\subsubsection{Cryptographic security}
\label{sec-crypto}
Above the communication base layer elaborated in the previous section we have to layer a cryptographic protocol to
ensure system security. We want to avoid a case where a third party could interfere with our system or even subvert this
safety system itself for an attack.  From a protocol security perspective the system we are looking for can informally
be modelled as consisting of three parties: the trusted \emph{transmitter}, one of a large number of untrusted
\emph{receivers}, and an \emph{attacker}.  These three play according to the following rules:

\begin{description}
    \item[Access.] Both transmitter and attacker can transmit any bit sequence.
    \item[Indistinguishability.] The receiver receives any transmission by either but cannot distinguish between them.
    \item[Kerckhoff's principle.] Since the protocol design is public and anyone can get access to an electricity meter
        the attacker knows anything any receiver might know\cite{kerckhoff01,kerckhoff02}.
    \item[Priority.] The transmitter is stronger than an attacker and will ``win'' during simultaneous transmission.
    \item[Seeding.] Both transmitter and receiver can be seeded out-of-band with some information on each other such as
        public key fingerprints.
\end{description}

We are not considering situations where an attacker attempts to jam an ongoing transmission. In practice there are
several avenues to prevent such attempts. Compromised large loads that are being abused by the attacker can be manually
disconnected by the utility. Error-correcting codes can be used to provide resiliency against small-scale disturbances.
Finally, the transmitter can be designed to have high enough power to be able to override any likely attacker.

With the above properties in mind our goal is to find a cryptographic primitive that has the following properties:
\begin{description}
    \item[Authentication.] The transmitter can produce a message bit sequence that a certain subset of receivers can
        identify as being generated by the transmitter. On reception of this sequence, all addressed receivers perform a
        safety reset.
    \item[Unforgeability.] The attacker cannot forge a message, i.e.\ find a bit sequence other than one of the
        transmitter's previous messages that a receiver would accept. This implies that the attacker also cannot create
        a new distinct message from a previously transmitted message.
    \item[Brevity.] The message should be short. Our communication channel is outrageously slow compared to anything
        else used in modern telecommunications and every bit counts.
\end{description}

On a protocol level we also have to ensure \emph{idempotence}. Our system should have an at-most-once semantic. This
means for a given message each receiver either performs exactly one safety reset or none at all, even if the message is
re-transmitted by either the transmitter or an attacker.  We cannot achieve the ideal exactly-once semantic wit pure
protocol gymnastics since we are using an unidirectional lossy communication primitive. A receiver might be offline
(e.g.\ due to a local power outage) and then would not hear the transmission even if our broadcast primitive was
reliable. Since there is no back channel, the transmitter has no way of telling when that happens.  The practical impact
of this can be mitigated by the transmitter repeating the message a number of times.

It follows from the unforgeability requirement that we can trivially reach idempotence at the protocol level by keeping
a database of all previous messages and only accepting new messages. By considering this in our cryptographic design we
can reduce the storage overhead of this ``database''.

Along with the indistinguishability property the access requirement implies that we need a cryptographic
signature\cite{lamport01}.  However, we have relaxed constraints on this signature compared to standard cryptographic
practice\cite{anderson04}.  While cryptographic signatures need to work over arbitrary inputs, all we want to ``sign''
here is the instruction to perform a safety reset. This is the only message we might ever want to transmit so our
message space has only one element. The information content of our message thus is 0 bit! All the information we want to
transmit is already encoded \emph{in the fact that we are transmitting} and we do not require a further payload to be
transmitted: We can omit the entirety of the message and just transmit whatever ``signature'' we
produce\cite{haller01,rfc1760}. This is useful to conserve transmission bits so our transmission does not take an
exceedingly long time over our extremely slow communication channel.

We can modify this construction to allow for a small number of bits of information content in our message (say two or
three instead of zero) at no transmission overhead by transmitting the cryptographic signature as usual but simply
omitting the message. The message contains only a few bits of information and we are dealing with minutes of
transmission time so the receiver can reconstruct the message through brute-force. Though this trade-off between
computation and data transmission might seem inelegant it does work for our extremely slow link for up to a few bits of
information.

There is an important limitation in the rules of our setup above: The attacker can always record the reset bit sequence
the transmitter transmits and replay that same sequence later. Even without cryptography we can trivially prevent an
attacker from violating the at-most-once criterion. If every receiver memorizes all bit sequences that have been
transmitted so far it can detect replays. With this mitigation by replaying an older authentic transmission an attacker
can cause receivers that were offline during the original transmission to reset at a later point. Considering our goal
is to reset them in the first place this should not pose a threat to the system's safety or security.

A possible scenario would be that an attacker first causes enough havoc for authorities to trigger a safety reset. The
attacker would record the trigger transmission. We can assume most meters were reset during the attack. Due to this the
attacker cannot cause a significant number of additional resets immediately afterwards.  However, the attacker could
wait several years for a number of new meters to be installed that might not yet have updated firmware that includes the
last transmission. This means the attacker could cause them to reset by replaying the original sequence.

A possible mitigation for this risk would be to introduce one bit of information into the trigger message that is
ignored by the replay protection mechanism.  This \emph{enable} bit would be $1$ for the actual reset trigger message.
After the attack the transmitter would then perform scheduled transmissions of a ``disarm'' message that has this bit
set to $0$. This message informs all new meters and meters that were offline during the original transmission of the
original transmission for replay protection without actually performing any further resets.

We could use any of several traditional asymmetric cryptographic primitives to produce these signatures. The
comparatively high computational effort required for signature verification would not be an issue. Transmissions take
several minutes anyway and we can afford to spend some tens of seconds even in signature verification. Transmission
length and by proxy system latency would be determined by the length of the signature. For RSA signature length is the
modulus length (i.e. larger than \SI{1000}{bit} for very basic contemporary security). For elliptic curve-based systems
curve length is approximately twice the security level and signature size is twice the curve length because two curve
points need to be encoded\cite{anderson02}.  For contemporary security this results in more than 300 bit transmission
length. We can exploit our unique setting's low message entropy to improve on this by basing our scheme on a
cryptographic hash function used as a one-way pseudo-random function (PRF). Hash-based signature schemes date back to
the very beginnings of cryptographic signatures\cite{anderson04,diffie01,lamport02}. Today, in general applications
schemes based on asymmetric cryptography are preferred but hash-based signature systems have their applications in
certain use cases. One example of such a scheme is the TESLA scheme\cite{perrig01} that is the basis for navigation
message authentication in the European Galileo global navigation satellite system. Here, a system based purely on
asymmetric primitives would result in too much computation and communication overhead\cite{ec05}.  In the following
sections we will introduce the foundations of hash-based signatures before deriving our authentication scheme.

\subsubsection{Lamport signatures}

1979, Lamport in \cite{lamport02} introduced a signature scheme that is based only on a one-way function such as a
cryptographic hash function. The basic observation is that by choosing a random secret input to a one-way function and
publishing the output, one can later prove knowledge of the input simply by publishing it. In the following paragraphs
we will describe a construction of a one-time signature scheme based on this observation. The scheme we describe is the
one usually called a ``Lamport Signature'' in modern literature but is slightly different from the variant described in
the 1979 paper. For our purposes we can consider both to be equivalent.

\paragraph{Setup.} In a Lamport signature, for an n-bit hash function $H$ the signer generates a private key $s =
\left(s_{b, i} | b\in\left\{0, 1\right\}, 0\le i<n\right)$ of $2n$ random strings of length $n$. The signer publishes a
public key $p = \left(p_{b, i} = H\left(s_{b, i}\right), b\in\left\{0, 1\right\}, 0\le i<n\right)$ that is simply the
list of hashes of each of the random strings that make up the private key.

\paragraph{Signing.} To sign a message $m$, the signer publishes the signature $\sigma = \left(\sigma_i = k_{H(m)_i,
i}\right)$ where $H(m)_i$ is the $i$-th bit of $H$ applied to $m$. That is, for the $i$-th bit of the message's hash
$H(m)$ the signer publishes either of $p_{0, i}$ or $p_{1, i}$ depending on the hash bit's value, keeping the other
entry of $P$ secret.

\paragraph{Verification.} The verifier can compute $H(m)$ themselves and check the corresponding entries $\sigma_i =
k_{H(m)_i}$ of $S$ correctly evaluate to $p_{b, i} = H\left(s_{b, i}\right)$ from $P$ under $H$.

The above scheme is a one-time signature scheme only. After one signature has been published for a given key, the
corresponding key must not be reüsed for other signatures. This is intuitively clear as we are effectively publishing
part of the private key as the signature, and if we were to publish a signature for another message an attacker could
derive additional signatures by ``mixing'' the two published signatures.

\subsubsection{Winternitz signatures}

An improvement to basic Lamport signatures as described above are Winternitz signatures as detailed in
\cite{merkle01,dods01}. Winternitz signatures reduce public key length as well as signature length for hash length $n$
from $2n$ to $\mathcal O \left(n/t\right)$ for some choice of parameter $t$ (usually a small number such as 4).

\paragraph{Setup.} The signer generates a private key $s = \left(s_i\right)$ consisting of $\lceil\frac{n}{t}\rceil$ random
bit strings. The signer publishes a public key $p = \left(H^{2^t}\left(s_i\right)\right)$ where each element
$H^{2^t}\left(s_i\right)$ is the $2^t$-fold recursive application of $H$ to $s_i$.

\paragraph{Signing.} The signer splits $m$ padded to a multiple of $t$ bits into $\lceil\frac{n}{t}\rceil$ chunks $m_i$ of
$t$ bit each. The signer publishes the signature $\sigma = \left( \sigma_i = H^{m_i}\left(s_i\right) \right)$.

\paragraph{Verification.} The verifier can calculate for each $\sigma_i = H^{m_i}\left(s_i\right)$ that $H^{2^t -
m_i}\left(\sigma_i\right) = H^{2^t - m_i}\left(H^{m_i}\left(s_i\right)\right) = H^{2^t - m_i + m_i} \left(s_i\right) =
p_i$.

To prevent an attacker from forging additional signatures from one signature by calculating $\sigma_i' =
H\left(\sigma_i\right)$ matching $m_i' = m_i + 1$, this scheme is usually paired with a simple checksum as described in
\cite{merkle01}.

\subsubsection{Using hash-based signatures for trigger authentication}

Applying these concepts the most basic trigger authentication scheme possible would be to simply generate a random
secret key bit string $s$ and publish $p = H(s)$ for some hash function $H$. To activate the trigger, $\sigma = s$ is
published and receivers verify that $H(\sigma) = p = H(s)$. This simplistic scheme has one main disadvantage: It is a
fundamentally one-time construction. To prevent an attacker from re-triggering a receiver a second time by replaying a
valid trigger $\sigma$ all receivers have to blacklist any ``used'' $\sigma$. Alas, this means we can only ever trigger
a receiver \emph{once}.  The good part is that any receiver that missed this trigger can still be triggered later, but
the bad part is that once $s$ is burned we are out of options. The trivial solution to this would be to simply provision
each receiver with a whole list of public keys in advance. This however takes $n$ times the amount of space for $n$-fold
retriggerability and for each one we have to memorize separately whether it has been used up. Luckily we can easily
derive a scheme that yields $n$-fold retriggerability and naturally memorizes replay state while using no more space
than the original scheme by taking some inspiration from Winternitz signatures.

In this improved scheme the secret key $s$ is still a random bit string. The public key is $p = H^n(s)$ for $n$-times
retriggerability.  The $i$-th time the trigger is activated, $\sigma_i = H^{n-i}(s)$ is published, and every receiver
can verify that $\sigma_{i-1} = H\left(\sigma_i\right)$ with $\sigma_0 = p$. In case a receiver missed one or more
previous triggers it continues computing $H\left(H\left(\sigma_i\right)\right)$ and
$H\left(H\left(H\left(\sigma_i\right)\right)\right)$ and so on until either reaching the $n$-th recursion
level--indicating an invalid signature--or finding $H^n\left(\sigma_i\right) = \sigma_j$ with $\sigma_j$ being the last
signature this receiver recorded or $p$ in case there is none.

This scheme provides replay protection since the receiver memorizes the last signature they acted on. Public key length
is equal to the length of the hash function $H$ used. Even for our embedded systems use case $n$ can realistically be up
to $\mathcal O\left(10^3\right)$, which is enough for our purposes. This use of a hash chain for event authentication is
identical to the one in the S/KEY one-time password system\cite{anderson04,haller01,rfc1760}.
% 1990ies crypto yeah!

The ``disarm'' message we discussed above for replay protection can be integrated into this scheme by encoding the
``enable'' bit into the least significant bit of $n$ in our $H^n$ construction. In the chain of valid signatures every
second one would be a disarm signature: Reset and disarm signatures would alternate in this scheme. By skipping a disarm
signature two resets can still be triggered directly after one another.

In practice it may be useful to have some control over which meters reset. An attack exploiting a particular network
protocol implementation flaw might only affect one series of meters made by one manufacturer. Resetting \emph{all}
meters may be too much in this case. A simple solution for this is to define addressable subsets of meters.  ``All
meters'' along with ``meters made by manufacturer $x$'' and ``meters of model $y$'' are good choices for such scopes. On
the cryptographic level the protocol state is simply duplicated for each scope. This incurs memory and computation
overhead linear in the number of scopes but device memory requirements are small at a few bytes only and computation is
of no concern due to the very slow channel so this simple solution is adequate. The transmitter has to either store
copies of all scope's keys or derive these keys from a root key using the scope's identifier. Keys are small and the
transmitter would be using a regular server or hardware security module for key management so either easily feasible.

A diagram of the key structure in this key management scheme is shown in Figure \ref{fig:sig_key_chain}. The
transmitter key management is shown in Figure \ref{fig:tx_scope_key_illu}. This scheme is simplistic but suffices for
our prototype in Section \ref{sec-prototype} and may even be useful in a practical implementation. During
standardization of a safety reset system the key management system would most likely have to be customized to the
particular application's requirements. Developing an universal solution is outside the scope of this work.


% FIXME resolve cut ---

\subsection{Simulation Results}

\section{Discussion}
\label{sec_conclusion} 

\printbibliography[heading=bibintoc]

%%% FIXME remove appendix and work into text.

\center{
    \center{This is version \texttt{\input{version.tex}\unskip} of this paper, generated on \today. The git repository
    can be found at:}

    \center{\url{https://git.jaseg.de/safety-reset.git}}
}
\end{document}