% Source: paper/safety-reset-paper.tex (blob f4e463315c94e7e9bf62564c6e780ec150d76cfc)
\documentclass[sigconf]{acmart}

\usepackage[binary-units]{siunitx}
\DeclareSIUnit{\baud}{Bd}
\DeclareSIUnit{\year}{a}
\usepackage{graphicx,color}
\usepackage{subcaption}
\usepackage{array}
\usepackage{hyperref}
\usepackage{enumitem}

\renewcommand{\floatpagefraction}{.8}
\newcommand{\degree}{\ensuremath{^\circ}}
\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}}
\newcommand{\partnum}[1]{\texttt{#1}}

% https://eepublicdownloads.entsoe.eu/clean-documents/pre2015/publications/entsoe/Operation_Handbook/Policy_1_Appendix%20_final.pdf

%\keywords{Security, privacy and resilience in critical infrastructures \and Security and privacy in ``internet of
%things'' \and Cyber-physical systems \and Hardware security \and Network Security \and Energy systems \and Signal theory}

\copyrightyear{2022}
\acmYear{2022}
\setcopyright{rightsretained}
\acmConference[ACSAC]{Annual Computer Security Applications Conference}{December 5--9, 2022}{Austin, TX, USA}
\acmBooktitle{Annual Computer Security Applications Conference (ACSAC), December 5--9, 2022, Austin, TX, USA}
\acmDOI{10.1145/3564625.3564640}
\acmISBN{978-1-4503-9759-9/22/12}

\begin{document}

\acmConference[ACSAC '22]{Annual Computer Security Applications
Conference}{December 5--9}{Austin, TX, USA}

\title{
  Ripples in the Pond: Transmitting Information through Grid Frequency Modulation
}

\author{Jan Sebastian Götte}
\affiliation{
  \institution{Technische Universität Darmstadt}
  \city{Darmstadt}
  \country{Germany}
}
\email{research@jaseg.de}

\author{Liran Katzir}
\affiliation{
  \institution{Tel Aviv University}
  \city{Tel Aviv}
  \country{Israel}
}
\email{lirankatzir@tau.ac.il}

\author{Björn Scheuermann}
\affiliation{
  \institution{Technische Universität Darmstadt}
  \city{Darmstadt}
  \country{Germany}
}
\email{scheuermann@kom.tu-darmstadt.de}

\renewcommand{\shortauthors}{Götte, Katzir and Scheuermann}
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10010583.10010662.10010668.10010671</concept_id>
<concept_desc>Hardware~Power networks</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010583.10010662.10010668.10010672</concept_id>
<concept_desc>Hardware~Smart grid</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10010583.10010750.10010769</concept_id>
<concept_desc>Hardware~Safety critical systems</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010520.10010553.10010562.10010561</concept_id>
<concept_desc>Computer systems organization~Firmware</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10010520.10010553.10010562.10010563</concept_id>
<concept_desc>Computer systems organization~Embedded hardware</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10002978.10002997.10002998</concept_id>
<concept_desc>Security and privacy~Malware and its mitigation</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10002978.10003001.10003003</concept_id>
<concept_desc>Security and privacy~Embedded systems security</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10002978.10003001.10003599.10011621</concept_id>
<concept_desc>Security and privacy~Hardware-based security protocols</concept_desc>
<concept_significance>300</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}

\ccsdesc[500]{Hardware~Power networks}
\ccsdesc[300]{Hardware~Smart grid}
\ccsdesc[500]{Hardware~Safety critical systems}
\ccsdesc[300]{Security and privacy~Malware and its mitigation}
\ccsdesc[500]{Security and privacy~Embedded systems security}
\ccsdesc[300]{Security and privacy~Hardware-based security protocols}

\begin{abstract}
    The growing heterogeneous ecosystem of networked consumer devices such as smart meters or IoT-connected appliances
    such as air conditioners is difficult to secure, unlike the utility side of the grid which can be defended
    effectively through rigorous IT security measures such as isolated control networks. In this paper, we consider a
    crisis scenario in which an attacker compromises a large number of consumer-side devices and modulates their
    electrical power to destabilize the grid and cause an electrical
    outage~\cite{ctap+11,wu01,zlmz+21,kgma21,smp18,hcb19}.
    
    In this paper, we propose a broadcast channel based on the modulation of grid frequency through which utility operators
    can issue commands to devices at the consumer premises both during an attack for mitigation and in its wake to aid
    recovery. Our proposed grid frequency modulation (GFM) channel is independent of other telecommunication networks.
    It is resilient towards localized blackouts and it is operational immediately after power is restored.

    Based on our GFM broadcast channel we propose a ``safety reset'' system to mitigate an ongoing attack by disabling a
    device's network interfaces and resetting its control functions. It can also be used in the wake of an attack to aid
    recovery by shutting down non-essential loads to reduce strain on the grid.

    To validate our proposed design, we conducted simulations based on measured grid frequency behavior. Based on these
    simulations, we performed an experimental validation on simulated grid voltage waveforms using a smart meter
    equipped with a prototype safety reset system based on a commodity microcontroller.
\end{abstract}

\maketitle

\section{Introduction}

With the rollout of the smart grid, the IT security of electrical infrastructure has attracted increased attention in
recent years. Smart Grid security has two major components: The security of central SCADA systems, and the security
of equipment at the consumer premises such as smart meters and IoT devices. While there is previous work on both sides,
their interactions have not yet received much attention.

We consider the previously proposed scenario where a large number of compromised consumer devices is used alone or in
conjunction with an attack on the grid's central SCADA systems to destabilize the grid by rapidly modulating the total
connected load~\cite{ctap+11,wu01,zlmz+21,kgma21,smp18,hcb19}. Several devices have been identified as likely targets
for such an attack including smart meters with integrated remote disconnect switches~\cite{ctap+11,anderson01}, large
IoT-connected appliances~\cite{smp18,hcb19,chl20,olkd20} and electric vehicle chargers~\cite{kgma21,zlmz+21,olkd20}.
Such attacks are hard to mitigate, and existing literature focuses on hardening grid control
systems~\cite{kgma21,lzlw+20,lam21,zlmz+21} and device firmware~\cite{mpdm+10,smp18,zb20,yomu+20} to prevent compromise.
Despite the infeasibility of perfect firmware security, there is little research on \emph{post-compromise} mitigation
approaches. A core issue with post-attack mitigation is that network connections such as internet and cellular networks
between the utility and devices on consumer premises may not work due to the attack. Thus, mitigation strategies that
involve devices on the consumer premises will need an out-of-band communication channel.

In this paper, we propose a novel, resilient, grid-wide communication technique based on \emph{grid frequency
modulation} (GFM) that can be used to broadcast short messages to all devices connected to the electrical grid. The grid
frequency modulation channel is robust and can be used even during an ongoing attack. Based on our channel we propose
the \emph{safety reset} controller, an attack mitigation technique that is compatible with most smart meter and IoT
device designs. A safety reset controller is a separate controller integrated with the device that awaits an out-of-band
reset command transmitted through GFM. Upon reception of the reset command, it puts the device into a safe state (e.g.
\emph{heater off} or \emph{light on}) that interrupts attacker control over the device. To reduce attack surface and
cost, the safety reset controller is separated from the system's main application controller and does not have any
conventional network interfaces.

The grid frequency modulation channel can be operated by transmission system operators (TSOs) even during black-start
recovery procedures and it bridges the gap between the TSO's private control network and consumer devices that cannot
economically be equipped with other resilient communication techniques such as satellite transceivers. To demonstrate
our proposed channel, we have implemented a system that transmits error-corrected and cryptographically secured commands
through an emulated grid frequency-modulated voltage waveform to an off-the-shelf smart meter equipped with a prototype
safety reset controller based on a small off-the-shelf microcontroller.

The frequency behavior of the electrical grid can be analyzed by examining the grid as a large collection of mechanical
oscillators coupled through the grid via the electromotive force~\cite{rogers01,wcje+12}. The generators and motors that
are electromagnetically coupled through the grid's transmission lines and transformers run synchronously with each
other, with only minor localized variations in their rotation angle. The dynamic behavior of grid frequency is a direct
product of this electromechanical coupling: With increasing load, frequency drops because turbines move slower under
higher torque, and conversely, with decreasing load, frequency rises. Industrial control systems keep frequency close
to its nominal value over time spans of minutes or hours, but over shorter time spans the combined inertia of all
grid-connected generators and motors is what regulates frequency.

Grid frequency modulation works by quickly modulating the power of a large, grid-connected load or generator. When this
modulation is at low amplitude and high frequency, it is below the thresholds set for the grid's automated control
systems and monitoring systems and it will directly affect frequency according to the grid's inertia. GFM differs from
traditional Powerline Communication (PLC) systems in that it works at much lower frequencies, it directly modulates the
grid's fundamental frequency instead of superimposing an additional signal on top of it, and by nature it reaches every
device within one synchronous area as the signal is embedded into the fundamental grid frequency. Traditional PLC uses a
superimposed voltage, which is quickly attenuated across long distances. Practically speaking, using GFM a single large
transmitter can cover an entire synchronous area, while in traditional PLC hundreds or thousands of smaller transmitters
would be necessary. Unlike traditional PLC, any large industrial load that allows for fast computer control with slew
rates in the order of several percent of total load per second can act as a GFM transmitter with minimal or no hardware
modifications.

\begin{figure}
    \centering
    \includegraphics[width=0.45\textwidth]{flowchart}
    \caption{Structural overview of our concept. 1 - Government authority or utility operations center. 2 - Emergency
    radio link. 3 - Aluminium smelter. 4 - Electrical grid. 5 - Target smart meter.}
    \Description{A schematic overview of the safety reset system with its parts represented by icons. A signal is sent
    from a radio tower next to a government building to a radio tower next to a factory. The factory forwards this
    signal to the electrical grid, where it is transmitted through a series of transformers to a smart meter at a
    residential building.}
    \label{fig_intro_flowchart}
\end{figure}

Figure~\ref{fig_intro_flowchart} shows an overview of our concept using a smart meter as the target device and a large
aluminium smelter temporarily re-purposed as a GFM transmitter.  Two scenarios for its application are before or during
a cyber attack, to stop an attack on the electrical grid in its tracks, and after an attack while power is being
restored to prevent a repeated attack. In both scenarios, our concept is independent of telecommunication networks (such
as the internet or cellular networks) as well as broadcast systems (such as cable television or terrestrial broadcast
radio) while requiring only inexpensive signal processing hardware and no external antennas (such as are needed for
satellite communication). A grid frequency-based system can function as long as power is still available, or as soon as
power is restored after the attack. One powerful function this allows is ``flushing out'' an attacker from compromised
smart meters after an attack, before restoring smart meter internet connectivity.

Using simulations we have determined that control of a $\SI{25}{\mega\watt}$ load such as a large aluminium smelter,
load bank or photovoltaic farm would allow for the transmission of a cryptographically secured safety reset signal
within $15$ minutes. We have designed and constructed a proof-of-concept prototype receiver that demonstrates the
feasibility of decoding such signals on a resource-constrained microcontroller.

\subsection{Motivation}

Consumer devices are increasingly becoming \emph{smart}. Large numbers of IoT devices are connected through the public
internet, and in several countries internet-connected Smart Meters can disconnect entire households from the grid in
case of unpaid bills~\cite{anderson01}. The increasing proliferation of smart devices on the consumer side presents an
opportunity to grid operators, who rely on forecasts for the cost-optimized control of generation and power flow. The
core of the \emph{Smart Grid} vision is that utilities can now gather detailed data for more accurate consumption
forecasts, and in some cases can even adjust parameters of large devices like water heaters to smooth out load spikes.

However, this increased degree of visibility and control comes with an increased IT security risk. In this paper we
focus on scenarios where an attacker compromises a large number of grid-connected remote-controllable devices. This may
be simple smart home devices such as IoT-connected air conditioners, but it may also include Smart Meters that are
outfitted with a remote disconnect switch as is common in some countries. By rapidly switching large numbers of such
devices in a coordinated manner, the attacker has the opportunity to de-stabilize the electrical
grid~\cite{zlmz+21,kgma21,smp18,hcb19}.

In this paper, we focus on assisting the recovery procedure after a successful attack because we estimate that this
approach will yield a better return on investment in overall grid stability versus resources spent on security
measures compared to bug hunting in device firmware. Previous work on IoT and Smart Grid security has focused on the
prevention of attacks through firmware security measures. While research on prevention is important, we estimate that its
practical impact will be limited by the diversity of implementations found in the field~\cite{nbck+19,zlmz+21,smp18}. We
predict that it would be a Sisyphean task to secure the firmware of a number of devices sufficient to deny an
attacker the critical mass needed to cause trouble. Even if all flaws in the firmware of a broad range of devices would
be fixed, users still have to update. In smart grid and IoT devices, this presents a difficult problem since user
awareness is low~\cite{nbck+19}.

\subsection{Attacker model}

According to the above criteria, our attacker model has the following key features:

\begin{itemize}
    \item The attacker cannot compromise the utility operators' SCADA systems.
    \item The attacker can compromise and subsequently control a large number of target devices at the customer's
        premises such as smart meters or large IoT devices such as air conditioners or central heating systems.
    \item Devices that may become targets of attacks can be designed to include a separate firmware and factory reset
        function that the attacker cannot circumvent. In the simplest case, this could be a separate microcontroller
        that is connected to an in-system programming interface of the device's application processor.
\end{itemize}

\subsection{Contents}

Starting from a high level architecture, we have carried out simulations of our concept's performance under real-world
conditions using measured grid frequency data. Based on these simulations we implemented an end-to-end prototype of our
proposed safety reset controller as part of a realistic smart meter demonstrator. Finally, we experimentally validated
our results based on a simulated mains voltage signal and we will conclude with an outline of further steps towards a
practical implementation.

This work contains the following contributions:
\begin{enumerate}[topsep=4pt]
    \item We introduce Grid Frequency Modulation (GFM) as a communication primitive. % FIXME done before in that one paper
    \item We elaborate the fundamental physics underlying GFM and theorize on the constraints of a practical
        implementation.
    \item We design a communication system based on GFM.
    \item We carry out extensive simulations of our system to determine its performance characteristics.
\end{enumerate}

%\subsection{Notation}
% FIXME drop or rework this section ; actually update notation to be consistent throughout
%To a computer scientist there is one confusing aspect to the theory of grid frequency modulation. GFM can be seen as a
%frequency modulation (FM) with a baseband signal in the band below approximately $f_m = \SI{5}{\hertz}$ that is
%modulated on top of a carrier signal at $f_c = \SI{50}{\hertz}$ in case of the European electrical grid. The frequency
%deviation $f_\Delta$ that the modulated carrier deviates from its nominal value of $f_m$ is very small at only a few
%milli-Hertz.
%
%When grid frequency is measured by first digitizing the mains voltage waveform, then de-modulating digitally, the FM's
%signal-to-noise ratio (SNR) is very high and is dominated by the ADC's quantization noise and nearby mains voltage noise
%sources such as resistive droop due to large inrush current of nearby machines.
%
%Note that both the carrier signal at $f_c$ and the modulation signal at $f_m$ both have unit Hertz. To disambiguate
%them, in this paper we will use \textbf{bold} letters to refer to the carrier waveform $\mathbf{U}$ or frequency
%$\mathbf{f_c}$ as well as its deviation $\mathbf{f_\Delta}$, and we will use normal weight for the actual modulation
%signal and its properties such as $f_m$.

\section{Background on the electrical grid}
\subsection{Components and interactions}

The electrical grid transmits electrical power from generators to loads through alternating current. Any device that is
connected to the grid must run \emph{synchronous} with the grid, i.e.\ it must produce or consume power following the
grid's voltage waveform. In generators and motors, the electromotive force acts to synchronize the device with the grid.
Connecting a generator that has not been synchronized to the grid leads to large currents flowing through the
generator's windings, inducing extreme forces that can mechanically destroy the generator. Similarly, if the inverters
of a solar power station would try to fight the grid, the grid would win and the inverters' power semiconductors would
release their magic smoke.

Originally, all power sources on the grid were synchronous rotating generators. Today, the shift towards renewable
energy and the introduction of high-voltage DC links has led to some of the grid's generating capacity being replaced
with inverters that electronically emulate the grid's voltage waveform to efficiently convert a DC input to the grid's
alternating current.

The generators and loads on the grid are linked through a complex network of transmission lines. Transformers are used
to couple between transmission lines operating at different voltage levels, and several types of switches allow
utilities to steer power flow throughout this network. Through the electromotive force, all synchronous generators
connected to the grid are electromechanically coupled. Transmission lines introduce a (small) phase delay to the
electric fields traversing the grid, but besides local differences in phase, all parts of the grid are synchronous.  

\subsection{Grid frequency behavior}

On the electrical grid, generation and consumption of energy must be precisely matched at all times for the grid to stay
at a constant, synchronous frequency. If generation outpaces consumption, generators would provide less mechanical
resistance to their source of mechanical power, or \emph{prime mover}, which would lead the generators to spin faster
and faster. Similarly, if consumption outpaced production, the increased mechanical load would slow down generators,
ultimately leading to a collapse.

In day-to-day operation, the frequency of the electrical grid is maintained at a fixed, stable level through several
layers of control systems on top of the grid's inherent mechanical inertia. Fast-acting automatic primary control
stabilizes temporary frequency excursions, while slower automatic secondary control and manual tertiary control
re-adjust devices' operating points back to their nominal values after they have shifted due to primary control action.

\subsection{Black-start recovery}

To function, the grid relies on a delicate balance between electricity generation, transmission and consumption.  When
this balance is disturbed, cascading failures can occur, and because this balance must be maintained at all times,
the recovery from a large-scale power outage is a complex operational challenge. Since all consumers and producers that
are connected to the electrical grid are physically coupled through the electromotive force, a fault in one part of the
grid affects all devices connected across the grid. A transmission line shutting off can lead other, nearby lines to
overload and shut off, and a generator or consumer suddenly shutting off causes a transient in the grid's frequency. If
the frequency goes too far out of bounds, protection devices take power plants and large industrial loads offline.

The recovery from a large-scale outage requires the grid's operators to bring generators and loads back online one by
one while continuously maintaining balance between generation and consumption to avoid their protection devices shutting
them down again. To coordinate this process, transmission system operators cannot rely on the public internet or
cellular networks, as they may not work during a large-scale power outage. Instead, they maintain private communication
infrastructure using dedicated lines rented from telecommunication providers, fibers run along transmission lines, and
dedicated radio links.

To start from a complete outage, first a number of \emph{black start}-capable power stations that can start by
themselves without any external power are brought online. With their help, other power stations and consumers are
gradually brought online until a part of the grid has been restored to nominal operation. This process can be performed
simultaneously in different parts of the grid. After these \emph{islands} have been restored, they can then be
synchronized and re-joined to restore the grid to its normal state.

\subsection{Demand-side response and Smart Metering}

Maintaining the balance between electricity generation and consumption under varying load conditions is critical.
Utilities can access different energy sources, each of which has its own trade-off in response speed versus energy
cost. For instance, the availability of wind and solar power cannot be controlled at all, while hydroelectric power
plants can quickly regulate the speed and power output of their turbines. Combined with the complex layout of the grid's
infrastructure such as transmission lines, these economical factors lead to a complex optimization problem, the quality
of whose solution directly manifests itself in the utility's bottom line.

For decades, one solution to this issue has been demand-side response (DSR)~\cite{rs48}. In DSR, large loads such as
water heaters are centrally controlled by the utility to switch on outside of peak demand. Since the precise timing of
these loads is of no consequence to their user, users are happy to get slightly better prices from their utility while
utilities gain a degree of control allowing them to optimize their network's performance. As part of the smart grid
vision, DSR will be utilized in a larger fraction of consumer devices.

A core component of the smart grid is the rollout of ``Advanced Metering Infrastructure'' (AMI), colloquially known as
smart meters. Smart meters are electricity meters that use a real-time communication interface to automatically transmit
high-resolution measurements to the utility. In contrast to the yearly reading schedule of traditional electricity
meters, smart meters can provide near-realtime data that the utility can use for more accurate load forecasting.

\subsection{Powerline Communication (PLC)}

A core issue in smart metering and demand-side response is the communication channel from the meter to the greater
world. Smart meters are cost-constrained devices, which limits the use of landline internet or cellular connections.
Additionally, electricity meters are often installed in basements, far away from the customer's router and with soil and
concrete blocking radio signals. For these reasons, in some AMI deployments, powerline communication (PLC) has been
chosen for the meters' uplink.

Since the early days of the electrical grid, powerline communication has been used to control devices spread throughout
the grid from a central transmitter~\cite{rs48}. PLC systems super-impose a modulated higher-frequency signal on top of
the grid voltage. When the carrier frequency of this modulation is in the audible frequency range, low data
rates can be transmitted over distances of several tens of kilometers. By using a radio frequency carrier, higher data
rates can be achieved across shorter distances~\cite{pvyh03}. Audio frequency PLC, called ``ripple control'', is still
used today by utilities for demand-side response, remote-controlling special water heaters to avoid times of
peak electricity demand.

Powerline communication systems are usually uni-directional, but there are instances of bi-directional powerline
communication for smart meter reading~\cite{ec03,rs48,gungor01,agf16}.

\section{Related work} 
\label{sec_related_work}

The security of IoT devices as well as the smart grid has received extensive attention in the
literature~\cite{nbck+19,acsc20,smp18,ykll17,anderson01,anderson02,zlmz+21,kgma21,hcb19,mpdm+10,lzlw+20,chl20,lam21,olkd20,yomu+20}.
The challenges of IoT device security and the security of smart meters and other smart grid devices are similar because
smart grid devices are essentially IoT devices in a particularly sensitive location~\cite{zheng01,ifixit01,acsc20}. In
both device types, the challenge is that securing embedded firmware is difficult, and adding network interfaces and cost
constraints only makes the task harder.

In some countries, smart meters can have a built-in off-switch that is used to disconnect customers who do not pay their
electricity bill. An attack scenario in which the attacker compromises a large number of such meters has been discussed
by Anderson and Fuloria in~\cite{anderson01}. In meters that do not have such a switch, an attacker can still use their
access to manipulate the meter's energy accounting, leading to financial impact on the utility operating the meter. This
scenario has received research attention~\cite{anderson02,mcdaniel01} and comes with the most direct industry
incentives.

In~\cite{smp18}, Soltan, Mittal and Poor investigated an attack scenario where an attacker first gains control over a
large number of high wattage devices through an IoT security vulnerability, then uses this control to cause rapid load
spikes. The researchers performed computer simulations for a range of parameters and concluded that an attacker
controlling 200--300 devices of $\SI{1}{\kilo\watt}$ each per megawatt of total grid power (equivalent to
30\% of total connected power) can cause a large-scale blackout in a healthy grid, while 10 such compromised
devices per megawatt (1\% of total power) are enough to cause cascading line failures that may ultimately lead
to a large-scale blackout.

In~\cite{hcb19}, Huang, Cardenas and Baldick raised a counter-point to the conclusions of Soltan et al., arguing that
limitations of their simulations in~\cite{smp18} have led them to over-estimate the severity of an attack. Using a
model tailored to accurately represent the grid's protection mechanisms, they found that due to the action of protection
systems such as load shedding and over frequency protection, large attacks of 30\% of total grid power are likely to
cause only localized blackouts and the decay of the grid into islands, instead of a large-scale blackout. Smaller attack
sizes between 1\% and 10\% were mostly harmless in their simulations.

From the literature, we get the overall impression that both IoT and Smart Grid security are challenging. Both lag behind
the security standard of state of the art desktop, server and smartphone operating systems. Reasons for this are the
relatively recent nature of the IoT software ecosystem and the large number of independent implementations. A unique
challenge to Smart Grid security is that due to the fragmentation of markets along national borders, certain devices
such as smart meters or DSR implementations exist in large monocultures.

Smart meters are consumer devices built down to a price and manufacturers' firmware security R\&D budgets are limited by
the high degree of market fragmentation that is caused by mutually incompatible national smart metering standards.
Landis+Gyr, a large utility meter manufacturer, state in their 2019 annual report that they invested \SI{36}{\percent}
of their total R\&D budget on embedded software while spending only \SI{24}{\percent} on hardware
R\&D~\cite{landisgyr01,landisgyr02}, which indicates tension between firmware security and the manufacturer's bottom
line. 

Compared to IoT and Smart Grid devices, the embedded firmware foundations of modern smartphones have received more
attention both from the industry and from academia. Pinto and Santos in~\cite{pinto01} conducted a survey of
implementations based on ARM's TrustZone embedded virtualization architecture and found a significant number of reported
vulnerabilities across different implementations. For instance, Rosenberg in~\cite{rosenberg01} found critical issues in
Qualcomm's QSEE hypervisor, and Kanonov and Wool in~\cite{kanonov01} identified a number of design weaknesses and
security vulnerabilities in Samsung's competing KNOX virtualization product. To us, the state of the field of embedded
security indicates that even if significant effort is spent on the security of IoT and Smart Grid devices to catch up
with desktop, server and smartphone security, significant vulnerabilities are likely to remain for some time to come.
In this instance, market forces do not align with the interest of the public at large. Vulnerabilities remain likely,
especially in code implementing complex network protocols such as TLS~\cite{georgiev01}, which may even be mandated by
national standards in some devices such as smart meters.

%\subsection{Reliably resetting an IoT or Smart Grid device}

\subsection{Oscillations in the electrical grid}

Common to the attacks on the electrical grid proposed in the papers discussed above is their approach of overloading
parts of the grid. However, scenarios have been proposed that go beyond a simple overload condition, in which an
attacker instead carefully exploits the physical characteristics of the grid to cause oscillations of increasing
amplitude, ultimately triggering a cascade of protection mechanisms. The purpose of this type of attack is to use a
small controllable load to cause outsized damage.

Electro-mechanical oscillation modes between different geographical areas of an electrical grid are a well-known
phenomenon. In their book~\cite{rogers01}, Rogers and Graham provide an in-depth analysis of these oscillations and
their mitigation. In~\cite{grebe01}, Grebe, Kabouris, López Barba et al.\ analyzed modes inherent to the
continental European grid. A report on an event where an oscillation on one such mode caused a problem can be found in
\cite{entsoe01}.

In~\cite{zlmz+21}, Zou, Liu, Ma et al.\ analyzed the possibility of a modal attack in which electric vehicle chargers
rapidly modulate their power to force an oscillation of a poorly damped wide-area electromechanical mode. In their
model an attacker compromises a backend smart grid control system that controls a large number of EV chargers. Using
mathematical analysis, small-scale simulations and limited practical experiments they validated the attack scenario and
developed a countermeasure that can be implemented as part of generator control systems and that when activated can
suppress forced oscillations of wide-area electromechanical modes.

\subsection{Proposed Countermeasures}

In parallel with research on theoretical attacks, countermeasures to these have also been proposed in academic
literature. In~\cite{kgma21}, the authors propose an extension to grid control algorithms aimed at increasing the grid's
robustness towards forced oscillations. In~\cite{smp18}, the authors propose that utility operators use a detailed
attacker model to engineer additional safety margins into the grid while minimizing the economic inefficiency of these
measures. On the IoT side, they note that due to the wide implementation diversity, the problem cannot be solved by
individual measures and propose additional fundamental research on IoT device security.

In~\cite{hcb19}, the authors conclude that simple demand attacks where compromised loads suddenly increase demand are
adequately mitigated by existing safety measures, in particular \emph{Under-Frequency Load Shedding} (UFLS), which forms
the basis of any grid's automatic emergency response. As part of UFLS, during a contingency the utility will
progressively disconnect loads according to set priorities until the consumption/generation balance has been restored
and a blackout has been averted.

% FIXME more sources!

\section{Grid Frequency as a Communication Channel}

The countermeasures discussed above are fully automatic. Such systems can provide a good first line of defense, but they
must be complemented by means of manual intervention since not every eventuality can be anticipated. During a
large-scale cyber attack, availability of internet and cellular connectivity cannot be relied upon. An attacker may
already have disabled such systems in a separate attack, or they may go down along with parts of the electrical grid.
Powerline communication systems will likely be unaffected by an attack, but at a range of no more than several tens of
kilometers, covering the entire grid would require a large upfront infrastructure investment for transmitters.

We propose to approach the problem of broadcasting an emergency control signal to all grid-connected devices such as
smart meters or IoT appliances within a synchronous area by using grid frequency as a communication channel.  Despite
the technological complexity of the grid, the physics underlying its response to changes in load and generation is
surprisingly simple. Individual machines (loads and generators) can be approximated by a small number of differential
equations describing their control systems' interaction with the machines' physics, and the entire grid can be modelled
by aggregating these approximations into a large system of differential equations. As a consequence, small signal
changes in generation/consumption power balance cause an approximately proportional change in
frequency~\cite{kundur01,crastan03,entsoe02,entsoe04}. The slope of this first-order approximation is known as
\emph{Power Frequency Characteristic}, and in case of the continental European synchronous area happens to be about
\SI{25}{\giga\watt\per\hertz}  according to the European electricity grid authority, ENTSO-E.

If we modulate the power consumption of a large load, this modulation will result in a small change in frequency
according to that characteristic. As long as we stay within the operational limits set by
ENTSO-E~\cite{entsoe02,entsoe03}, this change will not degrade the operation of other parts of the grid. The advantages
of grid frequency modulation are the fact that a single transmitter can cover an entire synchronous area as well as low
receiver hardware complexity.

To the best of the authors' knowledge, grid frequency modulation has only ever been proposed as a communication channel
at very small scales in microgrids before~\cite{urtasun01} and has not yet been considered for large-scale application.

\subsection{Comparison to other communication channels}

Compared to traditional channels such as Fiber To The Home (FTTH), 5G or LoRaWAN, grid frequency as a communication
channel has a resiliency advantage. It can start transmission as soon as a power island with a connected transmitter is
powered up, while communication networks such as FTTH or 5G are still rebooting or waiting for their centralized
infrastructure to come back online. Mesh networks such as LoRaWAN can cover short distances up to $\SI{20}{\kilo\meter}$
without requiring infrastructure to be available, but for longer distances LoRaWAN relies on the public internet for its
network backbone. Additionally, systems such as FTTH, 5G and LoRaWAN are built around a point-to-point communication
model and usually do not support a global broadcast primitive. During times when a large number of devices must be
reached simultaneously this can lead to congestion of cellular towers and servers. Therefore, during an ongoing cyber
attack, grid frequency is promising as a communication channel because only a single transmitter facility must be
operational for it to function, and this single transmitter can reach all connected devices simultaneously.

\subsection{Characterizing Grid Frequency}
\label{grid-freq-characterization}

To prepare our analysis of grid frequency modulation, we developed a device that allows us to collect measurements of
actual grid frequency behavior through safely recording the grid voltage waveform.  Our system consists of an
\texttt{STM32F030F4P6} ARM Cortex M0 microcontroller that records mains voltage using its internal 12-bit ADC and
transmits measured values through a galvanically isolated USB/serial bridge to a host computer. We derive our system's
sampling clock from a crystal oven to avoid frequency measurement noise due to thermal drift of a regular crystal:
\SI{1}{ppm} of crystal drift would cause a grid frequency error of $\SI{50}{\micro\hertz}$. We compared our
oven-stabilized clock against a GPS 1 pps reference and found that over a time span of 20 minutes both stayed stable
within 5 ppb of each other, which corresponds to the drift specification of a typical crystal oven.

In utility SCADA systems, Phasor Measurement Units (PMUs) are used to precisely measure grid frequency among other
parameters.  Details on the inner workings of commercial phasor measurement units are scarce but there is a large amount
of academic research on their measurement algorithms.  PMUs employ complex signal analysis algorithms to provide fast
and precise measurements even when given a heavily distorted input signal~\cite{narduzzi01,derviskadic01,belega01}.

In our application, we do not need the same level of precision. For the sake of simplicity, we use the universal
frequency estimation approach of Gasior and Gonzalez~\cite{gasior01}. In this algorithm, the windowed input signal is
processed using a Discrete Fourier Transform (DFT), then the signal's fundamental frequency is interpolated by fitting a
wavelet to the largest peak in the DFT result. The bias parameter of this curve fit is an accurate estimation of the
signal's fundamental frequency. This algorithm is similar to the interpolated DFT algorithm referenced by phasor
measurement literature~\cite{borkowski01}.

\begin{figure}
    \centering
    \includegraphics[width=0.45\textwidth]{../notebooks/fig_out/freq_meas_spectrum_new}
    \caption{The spectrum of grid frequency variations measured over 24 hours. The raw spectrum is shown in gray, and a
    smoothed spectrum is shown in red. The blue line is inversely proportional to frequency and illustrates the $1/f$
    nature of the spectrum. Distinctive peaks in the spectrum are marked with red crosses, and their locations
    are given on the bottom of the diagram.}
    \Description{A plot of power spectral density in Hertz squared per Hertz versus period in seconds. The plot shows
    the measured spectrum, a smoothed fit of the measured spectrum, and a one over f line for comparison. The measured
    spectrum is very noisy. The smoothed signal looks much cleaner, and roughly follows the one over f line. The
    smoothed data contains several notable features. At a period of about 80 seconds, its slope suddenly starts falling
    off faster than one over f to form a trough shape towards higher frequencies. There are several narrow bumps at
    round number periods such as 10 seconds, 60 seconds, 300 seconds and 900 seconds. There are three wider bumps
    visible. Two, a larger and a smaller one, next to each other centered on 4.7 seconds for the larger one and 7.0
    seconds for the smaller one. The last wider bump is below 0.5 seconds.}
    \label{fig_freq_spec}
\end{figure}

Using our grid frequency recorder, we performed a two-day measurement series of grid frequency.
Figure~\ref{fig_freq_spec} shows the frequency spectrum of grid frequency over this two-day span. In this spectrum, we
observe a number of features. Across the frequency range, we observe a broad $1/f$ noise. Below a period of
$\SI{10}{\second}$, this $1/f$ noise dips to a flat noise floor. We estimate that this low-noise region is caused by the
self-regulating effect of loads. Above a $\SI{10}{\second}$ period, primary control is activated and thus the $1/f$
noise we observe is the result of the interaction between primary control and consumer demand. On top of this $1/f$
behavior, the spectrum shows several sharp peaks at time intervals with a ``round'' number such as $\SI{10}{\second}$,
$\SI{60}{\second}$ or multiples of $\SI{300}{\second}$. These peaks are due to loads turning on or off depending on
wall-clock time, and demand forecasting not being able to precisely match the amplitude of these large changes in load.
Besides the narrow peaks caused by this effect we can also observe two wider bumps at $\SI{7.0}{\second}$ and
$\SI{4.7}{\second}$. These bumps closely correlate with the continental European synchronous area's oscillation modes at
$\SI{0.15}{\hertz}$ (east-west) and $\SI{0.25}{\hertz}$ (north-south)~\cite{grebe01}.

\section{Grid Frequency Modulation}

A transmitter for grid frequency modulation would be a controllable load of several megawatts that is located centrally
within the grid. A baseline implementation would be a spool of wire submerged in a body of cooling liquid (such as a
small lake) which is powered from a thyristor rectifier bank. Compared to this baseline solution, hardware and
maintenance investment can be decreased by repurposing a large industrial load as a transmitter. Going through a list of
energy-intensive industries in Europe~\cite{ec01}, we found that an aluminium smelter would be a good candidate. In
aluminium smelting, aluminium is electrolytically extracted from alumina solution. High-voltage mains power is
transformed, rectified and fed into approximately 100 series-connected electrolytic cells forming a \emph{potline}.
Inside these pots, alumina is dissolved in molten cryolite electrolyte at approximately \SI{1000}{\degreeCelsius} and
electrolysis is performed using a current of tens or hundreds of kiloamperes at a few volts per cell. The resulting pure
aluminium settles at the bottom of the cell and is tapped off for further processing.

Aluminium smelters are operated around the clock, and due to the high financial stakes their behavior under power
outages has been carefully characterized. Power outages of tens of minutes up to two hours reportedly do not cause
problems in aluminium potlines~\cite{eisma01,oye01}. Recently, even techniques for intentional power modulation without
affecting cell lifetime or product quality have been developed to take advantage of variable energy
prices~\cite{duessel01,eisma01,depree01}.  An aluminium plant's power supply is controlled to constantly keep all
smelter cells under optimal operating conditions. Modern power supply systems employ large banks of diodes or thyristors
to rectify low-voltage AC to DC to be fed into the potline~\cite{ayoub01}. Potline voltage is controlled through a
combination of a tap changer and a transductor.  Individual cell voltages are controlled by changing the physical
distance between anode and cathode.  In this setup, power can be electronically modulated using the thyristor rectifier.
Since the system does not have any mechanical inertia, high modulation rates are possible.

In~\cite{depree01}, the authors describe a setup where a large aluminium smelter in continental Europe is used as
primary control reserve for frequency regulation. Their system achieved a rise time of $\SI{15}{\second}$, meeting the
local $\SI{30}{\second}$ requirement for primary control. The authors calculated that their system can provide an
equivalent thermal energy storage capacity of $\SI{7.7}{\giga\watt\hour}$ using all plants of a single operator. At the
maximum modulation depth of $\SI{100}{\percent}$ for up to one hour that the paper cites, the resulting effective
modulation power is $\SI{7.7}{\giga\watt}$. Over a longer time span of $\SI{48}{\hour}$, they have demonstrated a
$\SI{33}{\percent}$ modulation depth which would correspond to a modulation power of $\SI{2.5}{\giga\watt}$. The
experiment from~\cite{depree01} shows that a modulation of part of an aluminium smelter's power consumption is possible
at no significant production impact and at low infrastructure cost. Aluminium smelters are already connected to the grid
in a way that they do not pose a danger to other nearby consumers when they turn off or on parts of the plant, as this
is commonplace during routine maintenance activities.

\subsection{Operating a GFM safety reset}

While a single large aluminium smelter could conceivably provide sufficient modulation power to cover the entire
continental European synchronous area, we have to consider operation during a black start, when the grid temporarily
divides into a number of disconnected power islands. A single transmitter would only be able to reach receivers on the
same power island.

To alleviate this constraint, a number of smaller transmitters throughout the network can be synchronized to act in
unison. Using existing industrial loads keeps the implementation cost of additional transmitters low. GPS-disciplined
frequency standards can keep transmissions synchronized across power islands even after a holdover period of several
days. When the utility rejoins power islands into the larger grid, the synchronized transmissions will constructively
interfere.

As illustrated in Figure~\ref{fig_intro_flowchart}, the transmitters are connected to a command center. For this
connection, a redundant set of long-range radio or satellite links can be used, as well as wired connections through the
utility's dedicated SCADA network. In an emergency, the command center can then trigger a transmission. Synchronized
through their GPS-backed frequency standards, two transmitters will then constructively interfere as soon as they are
connected to the same power island.

\subsection{Parameterizing Modulation for GFM}

Given the grid characteristics we measured using our custom waveform recorder and using a model of our transmitter, we
can derive parameters for the modulation of our broadcast system.  The overall network power-frequency characteristic of
the continental European synchronous area is approximately $\SI{25}{\giga\watt\per\hertz}$~\cite{entsoe02}. Thus, the
main challenge for a GFM system will be poor signal-to-noise ratio (SNR) due to low transmission power. A second layer
of modulation yielding some modulation gain beyond the basic amplitude modulation of the transmitter will be necessary
to achieve sufficient overall SNR.

The grid's frequency noise has significant localized peaks that might interfere with this modulation. Further
complicating things are the oscillation modes. A GFM system must be designed to avoid exciting these modes. However,
since these modes are not static, a modulation method that is designed around a specific assumption of their location
would not be future-proof. Given these concerns, the optimal second-level modulation technique for GFM is a
spread-spectrum technique. By spreading signal energy throughout a wide band, both the impact of local noise spikes is
minimized and the risk of mode excitation is reduced since spread-spectrum techniques minimize energy in any particular
sub-band.

The spread-spectrum technique that we chose is Direct Sequence Spread Spectrum for its simple implementation and good
overall performance. DSSS chip timing should be as fast as the transmitter's physics allow to exploit the low-noise
region between $\SI{0.2}{\hertz}$ and $\SI{2.0}{\hertz}$ in Figure~\ref{fig_freq_spec}. Going past
$\approx\SI{2}{\hertz}$ would complicate frequency measurement at the receiver side.

\subsubsection{Direct Sequence Spread Spectrum (DSSS) modulation}

Direct Sequence Spread Spectrum modulation is a common spread-spectrum technique that forms the basis of a number of
radio systems, most prominently all global navigation satellite systems (GNSS). As a spread-spectrum technique, DSSS
spreads out the signal's energy across a broad spectral range. This decreases the susceptibility of a DSSS signal to
narrowband interference. In GNSS, this allows the rejection of other nearby RF sources. In our use case, this makes the
signal immune to the many narrow peaks in the grid frequency's noise spectrum that are caused by control systems
synchronized to wall-clock time (cf.~Fig.~\ref{fig_freq_spec}). In addition to better interference immunity, DSSS has two
other important characteristics: It provides \emph{modulation gain}, i.e.~it allows a trade-off between data rate and
receiver sensitivity, and it allows for Code Division Multiple Access (CDMA). In CDMA, multiple DSSS-modulated signals
can be sent simultaneously through a shared channel with less impact to the resulting signal-to-noise ratio (SNR) than
would be the case for other modulation techniques.

A DSSS signal is made up from pseudo-random \emph{symbols}, which in turn are made up from individual physical layer
bits called \emph{chips}. Chips are encoded in the signal using a lower-layer modulation such as phase-shift keying
(e.g.~in GPS) or frequency-shift keying (in this work). In DSSS, a \emph{code} is a library of symbols that are
constructed to have minimal cross-correlation, i.e.\ they are near-orthogonal. A transmitter sends a symbol by
transmitting its particular pseudo-random chip sequence at a chosen polarity, conveying one bit of information. A
receiver demodulates the signal by directly correlating the incoming physical-layer signal with the symbol's chip
pattern, which results in a positive or negative peak when a symbol is received depending on its polarity.

By increasing the DSSS sequence length by a factor of $2$, SNR is improved by $\sqrt{2}$ assuming an additive white
Gaussian noise (AWGN) channel. At the same time, when doubling the sequence length, common DSSS code construction
methods provide twice the number of distinctive symbols allowing for twice the number of CDMA participants. Trading
twice the sequence length (and transmission time) for approximately $\SI{1.5}{dB}$ in SNR is a steep
trade-off, but is necessary in systems where transmitter power cannot be increased further and the resulting signal has
a marginally low SNR.

\subsubsection{DSSS parametrization}

To find the parameters for our DSSS modulation, we simulated a proof-of-concept modulator and demodulator using data
captured from our grid frequency sensor. Our simulations covered a range of combinations of modulation amplitude, DSSS
sequence bit depth, chip duration and detection threshold. Figure~\ref{fig_ser_nbits} shows our simulation results for
symbol error rate (SER) as a function of modulation amplitude with Gold sequences of several bit depths. From these
graphs we conclude that the range of practical modulation amplitudes starts at approximately $\SI{1}{\milli\hertz}$,
which corresponds to a modulation power of approximately $\SI{25}{\mega\watt}$~\cite{entsoe02}.
Figure~\ref{fig_ser_thf} shows SER against detection threshold relative to background noise. Figure~\ref{fig_ser_chip}
shows SER against chip duration for a given fixed symbol length. As expected from looking at our measured grid frequency
noise spectrum, performance is best for short chip durations and worsens for longer chip durations since shorter chip
durations move our signals' bandwidth into the lower-noise region from $\SI{0.2}{\hertz}$ to $\SI{2}{\hertz}$.
%FIXME introduce term "chip" somewhere

\begin{figure}
    \centering
    \includegraphics[width=0.3\textwidth]{../notebooks/fig_out/dsss_gold_nbits_overview}
    \caption{Symbol Error Rate as a function of modulation amplitude for Gold sequences of several lengths.}
    \Description{A plot of symbol error rate versus amplitude in millihertz. The plot shows four lines, one each for 5
    bit, 6 bit, 7 bit and 8 bit. All four lines form smooth step functions, plateauing at a symbol error rate of 1.0 for
    low amplitudes and falling to a symbol error rate of 0.0 for high amplitudes. The low-amplitude plateau is widest
    for 5 bit and narrowest for 8 bit. The falloff is steepest for 8 bit, and slowest for 5 bit. For 8 bit, a symbol
    error rate of 0.5 is crossed at about 0.4 millihertz. For 7 bit at about 0.6 millihertz, for 6 bit at 0.8 millihertz
    and for 5 bit at 1.3 millihertz. For 7 and 8 bit, symbol error rate settles at zero above 1.0 millihertz. For 6 bit
    above 2.0 millihertz and for 5 bit at about 3.0 millihertz.
    }
    \label{fig_ser_nbits}
\end{figure}

\begin{figure}
    \centering
    \hspace*{-5mm}\includegraphics[width=0.5\textwidth]{../notebooks/fig_out/dsss_thf_amplitude_5678}
    \vspace*{-5mm}
    \caption{SER vs.\ Amplitude and detection threshold. Detection threshold is set as a factor of background noise
    level.}
    \Description{This figure shows four plots that are similar to the previous figure. Each plot shows symbol error rate
    plotted against signal amplitude in millihertz. Each of the four plots shows a different gold sequence length, from
    5 bit up to 8 bit. Each plot contains more than ten traces that are color-coded for a different detection threshold
    factor. All plots show that a high threshold factor going towards 10 shifts the symbol error rate curve towards
    higher amplitudes, implying a less sensitive receiver. For lower threshold factors the sensitivity improves,
    however, for very low threshold factors performance deteriorates and the plotted curves suddenly become completely
    erratic, with several curves for low threshold factors around 2 at all bit lengths never reaching symbol error rates
    below 0.2. The middle ground between the two seems to be a threshold factor of around 5. The four plots show a clear
    dependency between receiver sensitivity and gold code length. For a 5 bit gold code, only a few graphs settle at all
    and those that do settle towards zero symbol error rate only between 3 and 4 millihertz in amplitude. For a 6 bit
    gold sequence, most graphs settle, and for the best threshold factor the graph settles to zero symbol error rate
    below 2 millihertz amplitude. For the 7 bit gold code, the best graph settles at approximately 1.2 millihertz, and
    for the 8 bit gold code at approximately 0.8 millihertz.}
    \label{fig_ser_thf}
\end{figure}

\begin{figure}
    \centering
    \hspace*{-5mm}\includegraphics[width=0.5\textwidth]{../notebooks/fig_out/chip_duration_sensitivity_6}
    \vspace*{-5mm}
    \caption{SER vs.\ DSSS chip duration.}
    \Description{The figure shows two plots. The first plot shows symbol error rate against signal amplitude in
    millihertz, but this time it shows a cohort of curves for different chip durations. The general amplitude behavior
    is similar to the previous figure showing threshold factor instead, with a plateau at a 1.0 symbol error rate for
    low amplitudes, and a smooth step settling to a 0.0 symbol error rate for large signal amplitude. The plot shows
    chip durations between 0.1 seconds, equivalent to 6.4 seconds symbol duration and 5.0 seconds, equivalent to 320
    seconds symbol duration. Most curves settle within the plotted range of 0 to 5 millihertz. Larger chip durations
    settle only at higher amplitudes, and the fastest settling chip durations are also the shortest. There is a cluster
    of fast-settling curves settling around 1.0 millihertz amplitude for chip durations below 1.0 seconds. A clear best
    candidate is hard to distinguish from this cluster.
    The second plot in the figure shows the minimum amplitude necessary for a symbol error rate of 0.5 plotted in
    millihertz against chip duration in seconds. The graph shows a nicely round curve bottoming out at approximately
    0.75 millihertz for a chip duration of 0.3 seconds. For lower chip durations, the curve slightly rises, while for
    longer chip durations it rises by a lot, reaching 4.0 millihertz for a chip duration of 5.0 seconds.}
    \label{fig_ser_chip}
\end{figure}

\subsection{Parameterizing a PoC GFM ``Safety Reset''}

%FIXME introduce scenario
Taking these modulation parameters as a starting point, we proceeded to create a proof-of-concept smart meter emergency
reset system. On top of the modulation described in the previous paragraphs we layered simple Reed-Solomon error
correction~\cite{mackay01} and some cryptography. The goal of our PoC cryptographic implementation was to allow the
sender of an emergency reset broadcast to authorize a reset command to all listening smart meters. An additional
constraint of our setting is that due to the extremely slow communication channel all messages should be kept as short
as possible. The solution we chose for our PoC is a simplistic hash chain using the approach from the Lamport and
Winternitz One-time Signature (OTS) schemes~\cite{lamport02,merkle01}. Informally, the private key is a random
bit string. The public key is generated by recursively applying a hash function to this key a number of times. Each
smart meter reset command is then authorized by disclosing subsequent elements of this series. Unwinding the hash chain
from the public key at the end of the chain towards the private key at its beginning, at each step a receiver can
validate the current command by checking that it corresponds to the previously unknown input of the current step of the
hash chain. Replay attacks are prevented by the device memorizing the most recent valid command.  This simple scheme
does not afford much functionality but it results in very short messages and removes the need for computationally
expensive public key cryptography inside the smart meter.

Formally, we can describe our simple cryptographic protocol as follows. Given an $m$-bit cryptographic hash function $H
: \{0,1\}^*\rightarrow\{0,1\}^m$ and a private key $k_0 \in \{0,1\}^m$, we construct the public key as
$k_{n_\text{total}} = H^{n_\text{total}}(k_0)$ where $H^n(x)$ denotes the $n$-fold recursive application of $H$ to
itself, i.e.\ $H(H(\hdots H(x)))$. $n_\text{total}$ is the total number of signatures that the system can
issue over its lifetime. $n_\text{total}$ must be chosen with adequate safety margin to account for unpredictable future
use of the system. The choice of $n_\text{total}$ is of no consequence when a device checks reset authorization, but key
generation time grows linearly with $n_\text{total}$ since $H$ needs to be applied $n_\text{total}$ times. In practice,
given the speed of modern computers, values of $n_\text{total} > 10^9$ should pose no problem during key generation. For
public key $k_{n_\text{total}}$, the system can authorize up to $n_\text{total}$ commands by successively disclosing the
$k_i$ starting at $i=n_\text{total}-1$ and counting down until finally disclosing $k_0$. Since we only want to transmit a single bit
of information, we do not need any payload. Instead, we simply send a message $m =  (k_i)$ consisting solely of $k_i$.
The receiver of a message $m$ can check that the message is a legitimate command by checking $\exists i<q: H^i(m) =
k_\text{last}$ where $k_\text{last}$ is the last valid command that was received. $q$ is the maximum lookup depth that
the device will accept as valid. To conserve processing power, $q$ should be chosen to be much smaller than
$n_\text{total}$. If $q$ is chosen too small, a device might fall out of sync with the transmitter when it is disconnected
from the electrical grid for long enough that at least $q$ commands are issued in the meantime. In practice,
this should not be a concern since only few commands should be issued over the lifetime of the system.

During an emergency situation, not all safety reset controllers might be online at the same time. In case the electrical
grid is restored piece by piece with safety reset controllers coming back online in batches, a utility might repeatedly
transmit the same reset command. In our protocol, we handle this situation by memorizing the last valid received command
on the device side, and only acting \emph{once} when a new command is received. The transmission of one command thus
becomes idempotent, and the utility can repeat the command until sufficiently many devices have received the command and
performed a safety reset.

In our protocol, we define two commands, \emph{reset} and \emph{disarm}. We assign \emph{reset} and \emph{disarm} to the
$k_i$ in an alternating way. For odd $i$, $k_i$ is a reset command and for even $i$, $k_i$ is a \emph{disarm} command.
To trigger a safety reset, the utility transmits the next unused $k_{2i+1}$. The utility may transmit this command
repeatedly to also reset devices that have come online only after earlier transmissions have started. After a sufficient
number of devices have performed a safety reset, the utility then transmits the next disarm command, $k_{2i}$. When
devices receive the disarm command, they still update the last received command, but they do not perform any other
action. The initial private key, $k_0$, is a \emph{disarm} key.

The reason for interleaving two commands in this way is to prevent a specific attack scenario in which an attacker first
observes a safety reset command being transmitted, and then at a later time gains access to a large load that could act
as a grid frequency modulation transmitter. Without a \emph{disarm} command, this attacker could then later trigger a
safety reset in any device that has not received the original reset command yet. The \emph{disarm} command gives the
utility the option to revoke a prior \emph{reset} command for all devices that were offline during the original reset,
without triggering them to reset.

% FIXME add more precise/formal description of crypto
% FIXME add description of targeting/scope function?
% FIXME somewhere above describe entire reset system architecture????!!!
% FIXME add description of disarm message (replay protection)

\subsection{Experimental results}

\begin{figure}
    \centering
    \includegraphics[width=0.35\textwidth]{prototype.jpg}
    \caption{The completed prototype setup. The board on the left is the safety reset microcontroller. It is connected
    to the smart meter in the middle through an adapter board. The top left contains a USB hub with debug interfaces to
    the reset microcontroller. The cables on the bottom left are the debug USB cable and the \SI{3.5}{\milli\meter}
    audio cable for the simulated mains voltage input.}
    \Description{A photo of the safety reset prototype. Visible is a stand made from plywood to which a smart meter is
    mounted in the middle. To one side of the smart meter a light switch and a socket are connected. To the other side,
    an orange power cable exits towards the back of the stand. The smart meter is connected to a prototype circuit board
    with colorful wires. The prototype circuit board is in turn connected to a microcontroller development board. The
    development board is connected to a USB hub with both an SWD programming adapter and a USB to serial converter. A
    USB cable from the USB hub as well as a 3.5 millimeter audio cable from the prototype circuit board are neatly
    coiled up and hang down from the stand.}
    \label{fig_proto_pic}
\end{figure}

For a realistic proof of concept, we decided to implement our signal processing chain from DSSS demodulator through
error correction up to our simple cryptography layer in microcontroller firmware and demonstrate this firmware on actual
smart meter hardware, shown in Figure~\ref{fig_proto_pic}. In our proof of concept a safety reset controller is
connected to the main application microcontroller of a smart meter. The reset controller is tasked with listening for
authenticated reset commands on the voltage waveform, and on reception of such a command resetting the smart meter
application controller by flashing a known-good firmware image to its memory.

For our proof of concept, before settling on the commercial smart meter we first tried to use an \texttt{EVM430-F6779}
smart meter evaluation kit made by Texas Instruments. This evaluation kit did not turn out well for two main reasons.
One, it shipped with half the case missing and no cover for the high-voltage terminal blocks. Because of this some work
was required to get it electrically safe.  The second issue we ran into was that the development board is based around a
specific microcontroller from TI's \texttt{MSP430} series that is incompatible with common JTAG programmers.

Our initial assumption that a development kit would be easier to program than a commercial meter did not prove to be
true. Contrary to our expectations the commercial meter had JTAG enabled allowing us to easily read out its stock
firmware requiring neither reverse-engineering vendor firmware update files nor circumventing code protection measures.
The fact that its firmware was only available in its compiled binary form was not much of a hindrance as it proved not
to be too complex and all we wanted to know we found with just a few hours of digging in
Ghidra\footnote{\url{https://ghidra-sre.org/}}.

The signal processing chain of our PoC is shown in Figure~\ref{fig_demo_sig_schema}. To interoperate with existing
implementations of SHA-512 and Reed-Solomon decoding, this implementation was written in the C programming language. To
demonstrate an application close to a field implementation, we chose an Easymeter \texttt{Q3DA1002} smart meter as our
reset target. This model is popular in the German market and readily available second-hand. The meter consists of three
isolated metering ASICs connected to a data logging and display PCB through infrared optical links. To demonstrate the
safety reset's firmware reset functionality, we connected our safety reset microcontroller to the Texas Instruments
\texttt{MSP430} microcontroller on the meter's display and data logging board through the JTAG debug interface that the
board's vendor had conveniently left accessible. We ported part of
\texttt{mspdebug}\footnote{\url{https://dlbeer.co.nz/mspdebug/}} to drive the meter microcontroller's JTAG interface and
wrote a piece of demonstrator code that overwrites the meter's firmware with one that displays an identifying string on
the meter's display after boot-up.

\begin{figure}
    \centering
    \includegraphics[width=0.45\textwidth]{prototype_schema}
    \caption{The signal processing chain of our demonstrator.}
    \Description{A diagram showing the signal processing flow. The diagram shows a number of steps going from grid
    voltage waveform to trigger decision. The diagram begins with the DMA-assisted ADC capture. At this point, the
    signal is a clean analog sine wave. The next step is grid frequency estimation, after which the signal is a
    noise-like ragged line. After grid frequency estimation follows DSSS demodulation, which itself is made up of three
    steps. The first step of DSSS demodulation is convolution, which produces a small noise signal with a large peak
    somewhere in the middle. The peak is roughly ten times the amplitude of the noise and has two prominent negative
    side lobes to the left and right. The following step, CWT peak contrast enhancement, cleans up this signal and
    removes the side-lobes leaving only the positive peak sticking out of the background noise. The final step of DSSS
    demodulation is maximum likelihood estimation, which produces a vector of n plus k discrete elements. After DSSS
    demodulation, this vector is passed through Reed-Solomon error correction, which transforms it into a vector of now
    only n discrete elements. This vector is then finally processed in the cryptographic trigger protocol, which
    produces the final trigger decision.}
    \label{fig_demo_sig_schema}
\end{figure}

To measure grid frequency in our demonstrator, we ported the same code we used in
Section~\ref{grid-freq-characterization} to our demonstrator, again using the voltage measured using the
microcontroller's internal ADC but using a regular crystal instead of a crystal oven for the microcontroller's system
clock.  We decided to feed our proof-of-concept reset controller with an emulated grid voltage sine wave from a
computer's headphone output. Where in a real application this microcontroller would  take ADC readings of input mains
voltage divided down by a long resistive divider chain, we instead feed the ADC from a $\SI{3.5}{\milli\meter}$ audio
input. For operational safety, we disconnected the meter microcontroller from its grid-referenced capacitive dropper
power supply and connected it to our reset controller's debug USB power supply.

In the firmware development phase of our proof of concept, we tested every module such as DSSS demodulator, Reed-Solomon
decoder, or grid frequency estimation individually. This approach proved very useful for debugging. The modular
architecture allowed us to directly compare our demodulator implementation to our Jupyter/Python prototype, where we
found that our C implementation outperformed the Python prototype. Despite the algorithm's complexity, the
microcontroller C implementation has no issues processing data in real-time due to the low sampling rate necessary.

We performed several successful experiments using a signature truncated at 120 bit and a 5 bit DSSS sequence. Taking the
sign bit into account, the length of the encoded signature is 20 DSSS symbols. On top of this we used Reed-Solomon error
correction at a 2:1 ratio inflating total message length to 30 DSSS symbols. At the \SI{1}{\second} chip rate we used in
other simulations as well this equates to an overall transmission duration of approximately \SI{15}{\minute}. To give
the demodulator some time to settle and to produce more realistic conditions of signal reception we padded the modulated
signal with unmodulated noise on both ends.

\subsection{Discussion}

During an emergency in the electrical grid, the ability to communicate to large numbers of end-point devices is a
valuable tool for restoring normal operation. When a resilient communication channel is available, loads such as smart
meters and IoT devices can be equipped with a supervisor circuit that allows for a remote ``safety reset'' that puts the
device into a safe operating state. Using this safety reset, an attacker that uses compromised smart meters or IoT
devices to attack grid stability can be interrupted before they can conclude their attack. During recovery from an
outage, a safety reset can be used to reduce stress on the system during a black start by temporarily disabling
non-essential loads such as air conditioners.

The safety reset controller does not require any peripherals except for an ADC. Thus we expect code size to be the main
factor affecting per-unit cost in an in-field deployment of our concept. At around \SI{64}{\kilo\byte}, our demonstrator
firmware implementation is viable on low-end microcontrollers. Given that modern smart meters and IoT devices usually
use complex Systems on Chip (SoCs), a safety reset controller could be integrated into the main application processor
itself at little added complexity. In summary, we expect safety reset controllers to be commercially viable.

Safety reset controllers can be adapted to most IoT device and smart meter designs. Because they are independent from
other public utilities such as the internet or cellular networks, we believe in their potential as a last line of
defense providing resilience under large-scale cyber attacks. The next steps towards a practical implementation will be
a practical demonstration of broadcast data transmission through grid frequency modulation using a megawatt-scale
controllable load as well as further optimization of the modulation and data encoding and the demodulator
implementation.

\section{Conclusion}
\label{sec_conclusion} 

In this paper we have developed an end-to-end design for a safety reset system that provides these capabilities.
Our novel broadcast data transmission system is based on intentional modulation of global grid frequency. Our system is
independent of normal communication networks and can operate during a cyber attack. We have shown the practical
viability of our end-to-end design through simulations. Using our purpose-designed grid frequency recorder, we can
capture and process real-time grid frequency data in an electrically safe way. We used data captured this way as the
basis for simulations of our proposed grid frequency modulation communication channel. In these simulations, our system
has proven feasible. From our simulations we conclude that a large consumer such as an aluminium smelter at a small cost
can be modified to act as an on-demand grid frequency modulation transmitter.

We have demonstrated our modulation system in a small-scale practical demonstration.  For this demonstration, we have
developed a simple cryptographic protocol ready for embedded implementation in resource-constrained systems that allows
triggering a safety reset with a response time of less than 30 minutes.  In this demonstration we use simulated grid
frequency data to trigger a commercial microcontroller to perform a firmware reset of an off-the-shelf smart meter. The
next step in our evaluation will be to conduct an experimental evaluation of our modulation scheme in collaboration with
a utility and an operator of a multi-megawatt load.

\appendix
\section{Artifacts}

Source code for the demonstrator and simulations, as well as hardware EDA designs are available at the public git
repository at the following URL:

\begin{center}
    \url{https://git.jaseg.de/safety-reset.git}
\end{center}

\begin{acks}
    This work has been co-funded by the LOEWE initiative (Hesse, Germany) within the emergenCITY center.
\end{acks}

\bibliographystyle{ACM-Reference-Format}
\bibliography{\jobname}

\end{document}