1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
|
\documentclass[12pt,a4paper,notitlepage]{report}
\usepackage[utf8]{inputenc}
\usepackage[a4paper,textwidth=17cm, top=2cm, bottom=3.5cm]{geometry}
\usepackage[T1]{fontenc}
\usepackage[
backend=biber,
style=numeric,
natbib=true,
url=true,
doi=true,
eprint=false
]{biblatex}
\addbibresource{safety_reset.bib}
\usepackage{amssymb,amsmath}
\usepackage{listings}
\usepackage{eurosym}
\usepackage{wasysym}
\usepackage{amsthm}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{tikz}
\usepackage{mathtools}
\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
\DeclarePairedDelimiter{\paren}{(}{)}
\usetikzlibrary{arrows}
\usetikzlibrary{chains}
\usetikzlibrary{backgrounds}
\usetikzlibrary{calc}
\usetikzlibrary{decorations.markings}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{fit}
\usetikzlibrary{patterns}
\usetikzlibrary{positioning}
\usetikzlibrary{shapes}
\usepackage[binary-units]{siunitx}
\usepackage{hyperref}
\usepackage{tabularx}
\usepackage{commath}
\usepackage{graphicx,color}
\usepackage{subcaption}
\usepackage{float}
\usepackage{footmisc}
\usepackage{array}
\usepackage[underline=false]{pgf-umlsd}
\usetikzlibrary{calc}
%\usepackage[pdftex]{graphicx,color}
\usepackage{epstopdf}
\usepackage{pdfpages}
\usepackage{minted} % pygmentized source code
% Needed for murks.tex
\usepackage{setspace}
\usepackage[draft=false,babel,tracking=true,kerning=true,spacing=true]{microtype} % optischer Randausgleich etc.
% For german quotation marks
\newcommand{\degree}{\ensuremath{^\circ}}
\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}}
\usepackage{fancyhdr}
\fancyhf{}
\fancyfoot[C]{\thepage}
\newcommand{\includenotebook}[2]{
\fancyhead[C]{Included Jupyter notebook: #1}
\includepdf[pages=1,
pagecommand={\thispagestyle{fancy}\section{#1}\label{#2_notebook}}
]{resources/#2.pdf}
\includepdf[pages=2-,
pagecommand={\thispagestyle{fancy}}
]{resources/#2.pdf}
}
\begin{document}
% Beispielhafte Nutzung der Vorlage für die Titelseite (bitte anpassen):
\input{murks}
\titelen{A Post-Attack Recovery Architecture for Smart Electricity Meters}
\titelde{Eine Architektur zur Kontrollwiederherstellung nach Angriffen auf Smart Metering in Stromnetzen}
\typ{Masterarbeit}
\grad{Master of Science (M. Sc.)}
\autor{Jan Sebastian Götte}
\gebdatum{Aus Datenschutzgründen nicht abgedruckt} % Geburtsdatum des Autors
\gebort{Aus Datenschutzgründen nicht abgedruckt} % Geburtsort des Autors
\gutachter{Prof. Dr. Björn Scheuermann}{Prof. Dr.-Ing. Eckhard Grass}
\mitverteidigung % entfernen, falls keine Verteidigung erfolgt %FIXME
\makeTitel
\selbstaendigkeitserklaerung{31.03.2020}
\newpage
% Hier folgt die eigentliche Arbeit (bei doppelseitigem Druck auf einem neuen Blatt):
\tableofcontents
\newpage
\chapter{Introduction}
\section{Structure and operation of the electrical grid}
\subsection{Structure of the electrical grid}
\subsubsection{Generators and loads}
\subsubsection{Transformers}
\subsubsection{Tie lines}
\subsection{Operational concerns}
\subsubsection{Modelling the electrical grid}
\subsubsection{Generator controls}
\subsubsection{Load shedding}
\subsubsection{System stability}
\subsubsection{Power System Stabilizers}
\subsubsection{Smart metering}
\section{Smart meter technology}
\subsubsection{Common components}
Smart meters usually are built around a standard microcontroller. \label{sm-cpu}
\subsubsection{Cryptographic coprocessors}
\subsubsection{Physical structure}
\subsubsection{Physical installation}
\section{Regulatory frameworks around the world}
\subsection{International standards}
\subsection{The regulatory situation in selected countries}
\subsubsection{Germany}
\subsubsection{France}
\subsubsection{The UK}
\subsubsection{Italy}
\subsubsection{Northern America}
\subsubsection{Japan}
\subsection{Common themes}
\section{Security in smart grids}
The smart grid in practice is nothing more or less than an aggregation of embedded control and measurement devices that
are part of a large control system. This implies that all the same security concerns that apply to embedded systems in
general also apply to most components of a smart grid in some way. Where programmers have been struggling for decades
now with input validation\cite{leveson01}, the same potential issue raises security concerns in smart grid scenarios as
well\cite{mo01, lee01}. Only, in smart grid we have two complicating factors present: Many components are embedded
systems, and as such inherently hard to update. Also, the smart grid and its control algorithms act as a large
(partially-)distributed system, making problems such as input validation or authentication difficult to
implement\cite{blaze01} and adding a host of distributed systems problems on top\cite{lamport01}.
Given that the electrical grid is a major piece of essential infrastructure in modern civilization, these problems
amount to significant issues in practice. Attacks on the electrical grid may have grave consequences\cite{lee01} all the
while the long maintenance cycles of various components make the system slow to adapt. Thus, components for the smart
grid need to be built to a much higher standard of security than most consumer devices to ensure they live up to
well-funded attackers even decades down the road. This requirement intensifies the challenges of embedded security and
distributed systems security among others that are inherent in any modern complex technological system.
A point we will not consider in much depth is theft of electricity. A large part of the motivation of the introduction
of smart meters seems to be % TODO weak statement
to reduce the level of fraud by consumers. Academic papers tend to either focus on other benefits such as generation
efficiency gains through better forecasting or try to rationalize the fundamentally anti-consumer nature of smart
metering with strenuous claims of ``enormous social benefits''\cite{mcdaniel01}. We will entirely focus on grid
stability and discard electricity theft in the context of this paper for two reasons: One, billing inaccuracies of
electricity companies are of very low urgency compared to grid stability, and the one is a precondition for the other.
Two, utility companies can already put strong bounds on the amount of theft by simply cross-referencing meter readings
against trusted readings from upstream sections of the grid. This capability works even without smart meters and only
gains speed from smart meters, just as the old exploit of bypassing the meter with a section of wire can't be prevented
like this.
Due to these bounds on its volume, electricity theft using smart meter hacking would not scale. Hackers would simply be
rooted out one by one with no damage to consumers and very limited damage to utility companies. Damage in these
scenarios would be a far cry from the efficiency of an exponentially growing botnet.
\subsection{Smart grid components as embedded devices}
A fundamental challenge in smart grid implementations is the central role smart electricity meters play. Smart meters
are used both for highly-granular load measurement and (in some countries) load switching\cite{zheng01}.
Smart electricity meters are effectively consumer devices. They are built down to a certain price point that is
measured by the burden it puts on consumers and that is generally fixed by regulatory authorities. % FIXME cite
This requirement precludes some hardware features such as the use of a standard hardened software environment on a
high-powered embedded system (such as a hypervirtualized embedded Linux setup) that would both increase resilience
against attacks and simplify updates. Combined with the small market sizes in smart grid deployments
\footnote{
Most vendors of smart electricity meters only serve a handful of markets. For the most part, smart meter development
cost lies in the meter's software. % TODO cite?
There exist multiple competing standards applicable to various parts of a smart electricity meter. In addition,
most countries have their own certification regimen\cite{cenelec01}. This complexity creates a large development
burden for new market entrants\cite{perez01}.
}
this produces a high cost pressure on the software development process for smart electricity meters.
\subsection{The state of the art in embedded security}
Embedded security generally is much harder than security of higher-level systems. This is due to a combination of the
unique constraints of embedded devices (hard to update, usually small quantity) and their lack of capabilities
(processing power, memory protection functions, user interface devices). Even very well-funded companies continue to
have serious problems securing their embedded systems. A spectacular example of this difficulty is the recently-exposed
flaw in Apple's iPhone SoC first-stage ROM bootloader\footnote{
Modern system-on-chips integrate one or several CPUs with a multitude of peripherals, from memory and DMA
controllers over 3D graphics accelerators down to general-purpose IO modules for controlling things like indicator
LEDs. Most SoCs boot from one of several boot devices such as flash memory, ethernet or USB according to a
configuration set e.g. by connecting some SoC pins a certain way or set by device-internal write-only fuse bits.
Physically, one of the processing cores of the SoC (usually one of the main CPU cores) is connected such that it is
taken out of reset before all other devices, and is tasked with switching on and configuring all other devices of
the SoC. In order to run later initialization code or more advanced bootloaders, this core on startup runs a very
small piece of code hard-burned into the SoC in the factory. This ROM loader initializes the most basic peripherals
such as internal SRAM memory and selects a boot device for the next bootloader stage.
Apple's ROM loader performs some authorization checks, to ensure no unauthorized software is loaded. The present
flaw allows an attacker to circumvent these checks, booting code not authorized by Apple on a USB-connected iPhone,
compromising Apple's chain of trust from ROM loader to userland right at its root.
}, that allows a full compromise of any iPhone before the iPhone X. iPhone 8, one of the affected models, is still being
manufactured and sold by Apple today\footnote{
i.e. at the time this paragraph was written, on %FIXME
}. In another instance, Samsung put a flaw in their secure-world firmware used for protection of sensitive credentials
in their mobile phone SoCs in % FIXME year % .
If both of these very large companies have trouble securing parts of their secure embedded software stacks measuring a
mere few hundred bytes in Apple's case or a few kilobytes in Samsung's, what is a smart electricity meter manufacturer
to do? For their mass-market phones, these two companies have R\&D budgets that dwarf some countries' national budgets.
% FIXME hyperbole?
% FIXME cite
Since thorough formal verification of code is not yet within reach for either large-scale software development or
code heavy in side-effects such as embedded firmware or industrial control software\cite{pariente01},
the two most effective measures for embedded security are reducing the amount of code on one hand, and labour-intensively
checking and double-checking this code on the other hand. A smart electricity meter manufacturer does not have a say in the
former since it is bound by the official regulations it has to comply with, and will almost certainly not have sufficient
resources for the latter.
% FIXME expand?
% FIXME cite some figures on code size in smart meter firmware?
\subsection{Attack avenues in the smart grid}
If we model the smart grid as a control system responding to changes in inputs by regulating outputs, on a very high
level we can see two general categories of attacks: Attacks that directly change the state of the outputs, and attacks
that try to influence the outputs indirectly by changing the system's view of its inputs. The former would be an attack
such as one that shuts down a power plant to decrease generation capacity. The latter would be an attack such as one
that forges grid frequency measurements where they enter a power plant's control systems to provoke increasing
oscillation in the amount of power generated by the plant according to the control systems' directions.
% FIXME cite
% FIXME expand
\subsubsection{Communication channel attacks}
Communication channel attacks are attacks on the communication links between smart grid components. This could be
attacks on IP-connected parts of the core network or attacks on shared busses between smart meters and IP gateways in
substations. Generally, these attacks can be mitigated by securing the aforementioned communication links using modern
cryptography. IP links can be protected using TLS, and more low-level busses can be protected using more lightweight
Noise\cite{perrin01}-based protocols.
Cryptographic security transforms an attacker's ability to manipulate communication contents into a mere denial of
service attack. Thus, in addition to cryptographic security, safety under DoS conditions must be ensured to guarantee
continued system performance under attacks. This safety property is identical with the safety required to withstand
random outages of components, such as communications link outages due to physical damage from storms, flooding etc.
% FIXME cite papers on attack impact, on coutermeasures and on attack realization
In general, attacks at the meter level may be hard to weaponize % may be -> weak statement?
since meters are used mostly for billing and forecasting purposes % FIXME cite
and for more critical grid control purposes there exist several additional layers of sensors above smart meters that
limit how much an attacker can falsify smart meter readings without the manipulation being obvious. In order for an
attack to have more far-reaching consequences the attacker would need to compromise additional grid
infrastructure\cite{kim01,kosut01}.
\subsubsection{Exploiting centralized control systems}
The type of smart grid attack most often cited in popular discourse, and to the author's knowledge % FIXME verify, cite
the only type that has so far been conducted in practice, is a direct attack on centralized control systems. In this
attack, computer components of control systems are compromised by the same techniques used to compromise any other kind
of computer system such as exploiting insecure services running on internet-exposed ports and using one compromised
system to compromise other systems connected with it through an ostensibly secure internal network. These attacks are
very powerful as they yield the attacker direct control over whatever outputs the control systems are controlling. If an
attacker manages to compromise a power station's control computers, they may be able to influence generation output or
even cause an emergency shutdown. % FIXME
Despite their potentially large impact, these attacks are only moderately interesting from a scientific perspective. For
one, their mitigation mostly consists of a straightforward application of security practices well-known for decades.
Though there is room for the implementation of genuinely new, application-specific security systems in this field, the
general state of the art is lagging behind the rest of the computer industry such that the low-hanging fruit should take
priority. % FIXME cite this bold claim very properly
In addition, given political will these systems can readily be secured since there is only a comparatively small number
of them and driving a technician to every one of them in turn to install some security update is perfectly feasible.
\subsubsection{Control function exploits}
Control function exploits are attacks on the mathematical control loops used by the centralized control system. One
example of such an attack would be resonance attacks as described in \textcite{wu01}.
In this kind of attack, inputs from peripheral sensors indicating grid load to the centralized control system are
carefully modified to cause a disproportionally large oscillation in control system action. This type of attack relies
on complex resonance effects that arise when mechanical generators are electrically coupled. These resonances,
colloquially called ``modes'', are well-studied in power system engineering\cite{rogers01,grebe01,entsoe01}.
% FIXME: refer to section on stability control above here
Even disregarding modern attack scenarios, for stability electrical grids are designed with measures in place to dampen
any resonances inherent to grid structure. Still, requiring an accurate grid model, these resonances are hard to analyze
and unlikely to be noticed under normal operating conditions.
Mitigation of these attacks is most easily done by on the one hand ensuring unmodified sensor inputs to the control
systems in the first place, and on the other hand carefully designing control systems not to exhibit exploitable
behavior such as oscillations.
% FIXME cite mitigation approaches
\subsubsection{Endpoint exploits}
One rather interesting attack on smart grid systems is one exploiting the grid's endpoint devices such as smart
electricity meters\footnote{
Though potentially this could also aim at other kinds of devices distributed on a large scale such as sensors in
unmanned substations. % FIXME cite verify
}.
These meters are deployed on a massive scale, with several thousand meters deployed for every substation.
% FIXME cite (this should be straightforward)
Thus, once compromised, restoration to an uncompromised state can be potentially very difficult if it requires physical
access to thousands of devices hidden inaccessible in private homes.
By compromising smart electricity meters, an attacker can trivially forge the distributed energy measurements these
devices perform. In a best-case scenario, this might only affect billing and lead to customers being under- or
over-charged if the attack is not noticed in time. However, in a less ideal scenario the energy measurements taken by
these devices might be used to inform the grid's centralized control systems % FIXME cite
and a falsification of these measurements might lead to inefficiency.
In some countries and for some customers, these smart meters have one additional function that is highly useful to an
attacker: They contain high-current load switches to disconnect the entire household or business in case electricity
bills are left unpaid for a certain period. In countries that use these kinds of systems, the load disconnect is often
simply hooked up to one of the smart meter's central microcontroller's general-purpose IO pins, allowing anyone
compromising this microcontroller's firmware to actuate the load switch at will. % FIXME validate cite add pictures
Given control over a large number of network-connected smart meters, an attacker might thus be able to cause large-scale
disruptions of power consumption by repeatedly disconnecting and re-connecting a large number of consumers.
% FIXME cite some analysis of this
Combined with an attack method such as the resonance attack from \textcite{wu01}
that was mentioned above, this scenario poses a serious danger to grid stability.
% FIXME add small-scale load shedding for heaters etc.
\subsection{Attacker models in the smart grid}
\subsection{Practical attacks}
\subsection{Practical threats}
\subsection{Conclusion, or why we are doomed}
We can conclude that a compromise of a large number of smart electricity meters cannot be ruled out. The complexity of
network-connected smart meter firmware makes it exceedingly unlikely that it is in fact flawless. Large-scale
deployments of these devices under some circumstances such as where they are used with load disconnect relays make them
an attractive target for attackers interested in causing grid instability. The attacker model for these devices very
definitely includes enemy states, who have considerable resources at their disposal.
For a reasonable guarantee that no large-scale compromises of hard- and software built today will happen over a span of
some decades, we would have to radically simplify its design and limit attack surface. Unfortunately, the complexity of
smart electricity meter implementations mostly stems from the large list of requirements these devices have to conform
with. Additionally, standards have already been written and changes that reduce scope or functionality have become
exceedingly unlikely at this point.
A general observation with smart grid systems of any kind is that they comprise a zealous departure from the decentralized
control structure of yesterday's dumb grid and the advent of centralization at an enormous scale. This modern,
centralized infrastructure has been carefully designed to defend against malicious actors%FIXME cite
and all involved parties have an interest in keeping it secure. Still, like in any other system this centralization also
makes a very attractive target for attackers since an attacker can likewise employ this centralized control to their
goals. Fundamentally, decentralized systems tend to make attacks of any kind a lot more costly and one might question
whether security has truly been gained during smart grid rollout. % FIXME hot take maybe
\chapter{Restoring endpoint safety in an age of smart devices}
If as laid out in the previous paragraph we cannot rule out a large-scale compromise of smart energy meters, we have to
rephrase our claim to security. If we cannot rule out exploitation, we have to limit its impact. If we assume that we
cannot strip any functionality from smart meters since it may be required by standards or for enormous social
benefits\cite{mcdaniel01} % FIXME is sarcasm ok here?
all we can do is to flush out an attacker once they are in.
In a worst-case scenario an attacker would gain unconstrained code execution e.g. by exploiting a flaw in a network
protocol implementation. Since smart meters use standard microcontrollers that do not have advanced memory protection
functions (see pg. \pageref{sm-cpu}), at this point we can assume the attacker has full control over the main
microcontroller. With this control they can actuate the load switch if present, transmit data through the device's
communication interfaces or use the user interface components such as LEDs and the LCD. Using the self-programming
capabilities of modern flash microcontrollers, an attacker may even gain persistency without much trouble. Note that in
systems separating cryptographic functions into some form of cryptographic module such as systems used in Germany
% TODO list other countries as well? FIXME cite BSI standard requiring this
we can be optimistic and assume the attacker has not in fact compromised this cryptographic co-processor yet and does
not have access to any cryptographic secrets yet.
Given that the attacker has complete control over the meter's core microcontroller and given that due to cost
constraints we are bound to use whatever microcontroller the meter OEM has chosen for their design, we cannot rely on
software running on the core microcontroller to restore system integrity.
Our solution to this problem is to add another, very small microcontroller to the smart meter design. This
microcontroller will contain a small piece of software to receive cryptographically authenticated commands from utility
companies and on demand reset the meter's core microcontroller to a known-good state. We have to assume the code in the
core controller's flash memory has been compromised, so our only option to flush out an attacker is to re-program the
core microcontroller in its entirety. We propose using JTAG to re-program the core microcontroller
% TODO get terminology consistent. Is "core microcontroller" a good term here?
with a known-good firmware image read from a sufficiently large SPI flash connected to the reset controller. JTAG is
supported by most microcontrollers complex enough to end up in a smart meter design % TODO colloquialism
and given adequate documentation JTAG programming functionality can be ported to new microcontrollers with relatively
little work.
On the microcontroller side our solution requires the JTAG interface to be activated (i.e. not fused-shut) and for our
solution to work core microcontroller firmware must not be able to permanently disable the JTAG interface from within.
In microcontrollers that do not yet provide this functionality this is a minor change that could be added to a custom
microcontroller variant at low cost. On most microcontrollers keeping JTAG open should not interfere with code readout
protection. Code secrecy should be of no concern\cite{schneier01} here but besides security manufacturers have strong
preferences about this due to fear of copyright infringement.
\section{The theory of endpoint safety}
\label{sec_criteria}
In order to gain anything by adding our reset controller to the smart meter's already complex design we must satisfy two
interrelated conditions.
\begin{enumerate}
\item \textsc{security} means our reset controller itself does not have any remotely exploitable flaws
\item \textsc{safety} means our reset controller will perform its job as intended
\end{enumerate}
Note that our \textsc{security} property includes only remote exploitation, and excludes any form of hardware attack.
Even though most smart meters provide some level of physical security, we do not wish to make any assumptions on this.
In the following section we will elaborate our attacker model and it will become apparent that sufficient physical
security to defend against all attackers in our model would be infeasible, and thus we will design our overall system
to remain secure even assuming some number of physically compromised devices.
% FIXME expand
\subsection{Attack characteristics}
The attacker model these two conditions must hold under is as follows. We assume three angles of attack: Attacks by the
customer themselves, attacks by an insider within the utility company controlling the metering systems and lastly attacks
from third parties. Examples for these third parties are hobbyist hackers or outside cyber-criminals on the one hand,
but also other companies participating in the smart grid infrastructure besides the utility company such as intermediary
providers of meter-reading services.
Due to the critical nature of the electrical grid, we have to include hostile state actors in our attacker model. When
acting directly, these would be classified as third-party attackers by the above schema, but they can reasonably be
expected to be able to assume either of the other two roles as well e.g. through infiltration or bribery.
\textcite{fraunholz01} in their elaboration of their generalized attacker model give some classification of attackers
and provide a nice taxonomy of attacker properties. In their threat/capability rating, criminals are still considered
to have higher threat rating than state-sponsored attackers. The New York Times reported in 2016 that some states
recruit their hacking personnel in part from cyber-criminals. If this report is true, in a worst-case scenario we have
to assume a state-sponsored attacker to be the worst of both types. Comparing this against the other attacker types in
\textcite{fraunholz01}, this state-sponsored attacker is strictly worse than any other type in both variables. We are
left with a highly-skilled, very well-funded, highly intentional and motivated attacker.
Based on the above classification of attack angles and our observations on state-sponsored attacks, we can adapt
\textcite{fraunholz01} to our problem, yielding the following new attacker types:
\begin{enumerate}
\item \textbf{Utility company insiders controlled by a state actor}
We can ignore the other internal threats described in \textcite{fraunholz01} since an insider cooperating with a
state actor is strictly worse in every respect.
\item \textbf{State-sponsored external attackers}
A state actor can obviously directly attack the system through the internet.
\item \textbf{Customers controlled by a state actor}
A state actor can very well compromise some customers for their purposes. They might either physically
infiltrate the system posing as legitimate customers, or they might simply deceive or bribe existing customers
into cooperation.
\item \textbf{Regular customers}
Though a hostile state actor might gain control of some number of customers through means such as voluntary
cooperation, bribery, infiltration, they are limited in attack scale since they do not want to arouse premature
attention. Though regular customers may not have the motivation, skill or resources of a state-sponsored
attacker, potentially large numbers of them may try to attack a system out of financial incentives. To allow for
this possibility, we consider regular customers separate from state actors posing as customers in some way.
\end{enumerate}
\subsection{Overall structural system security}
Considering overall security, we first introduce the \emph{reset authority}, a trusted party acting as the single
authority for issuing reset commands in our system. In practice this trusted party may be part of the utility company,
part of an external regulatory body or a hybrid setup requiring both to cooperate. We assume this party will be designed
to be secure against all of the above attacker types. The precise design of this trusted party is out of scope for this
work but we will list some practical suggestions on how to achieve security below. % FIXME do the list
% FIXME put up a large box on this limitation
Using an asymmetric cryptographic design centered around the \emph{reset authority}, we rule out all attacks except for
denial-of-service attacks on our system by any of the four attacker types. All reset commands in our system originate
from the \emph{reset authority} and are cryptographically secured to provide authentication and tamper detection.
Under this model, attacks on the electrical grid components between the \emph{reset authority} and the customer device
degrade into man-in-the-middle attacks. To ensure the \textsc{safety} criterion from \ref{sec_criteria} holds we must
% FIXME check whether this \ref displays as intended
make sure our cryptography is secure against man-in-the-middle attacks and we must try to harden the system against
denial-of-service attacks by the attacker types listed above. Given our attacker model we cannot fully guard against
this sort of attack but we can at least choose a communication channel that is resilient against denial of service
attacks under the above model.
Finally, we have to consider the issue of hardware security. We will solve the problem of physical attacks on some small
number of devices by simply not programming any secret information into these devices. This also simplifies hardware
production. From consideration in this work we explicitly rule out any form of supply-chain attack as
out-of-scope.
% FIXME include considerations on production testing somewhere (is the device working? is the right key programmed?)
\subsection{Complex microcontroller firmware}
The \textsc{security} property from \ref{sec_criteria} is in a large part reliant on the security of our reset
controller firmware. The best method to increase firmware security is to reduce attack surface by limiting external
interfaces as much as possible and by reducing code complexity as much as possible.
% FIXME formalize this as something like "Design Goal DG-023-42-1" ?
If we avoid the complexity of most modern microcontroller firmware we gain another benefit beyond implicitly reduced
attack surface: If the resulting design is small enough we may attempt formal verification of our security property.
Though formal verification tools are not yet suitable for highly complex tasks they are already barely adequate for
small amounts of code and simple interfaces.
\subsection{Modern microcontroller hardware}
Microcontrollers have gained enormously in both performance/efficiency as well as in peripheral support. Alas, these
gains have largely been driven by insatiable customer demand for faster, more powerful chips and for a long time
security has not been considered important outside of some specific niches such as smartcards. Traditionally a
microcontroller would spend its entire lifetime without ever being exposed to any networks. Though this trend has been
reversing with the increasing adoption of internet-of-things devices
and more advanced security features have started appearing in general-purpose microcontrollers, most still lack even
basic functionality found in processors for computers or smartphones.
One of the components lacking from most microcontrollers is strong memory protection or even a memory management unit as
it is found in all modern computer processors and SoCs for applications such as smartphones. Without an MPU/MMU some
mitigations for memory safety violations cannot be implemented. This and the absence of virtualization tools such as
ARM's TrustZone make hardening microcontroller firmware a big task. It is very important to ensure memory safety in
microcontroller firmware through tools such as defensive coding, extensive testing and formal verification.
In our design we achieve simplicity on two levels: One, we isolate the very complex metering firmware from our reset
controller by having both run on separate microcontrollers. Two, we keep the reset controller firmware itself extremely
simple to reduce attack surface there.
\subsection{Regulatory and economical constraints}
%FIXME
\subsection{Safety vs. Security: Opting for restoration instead of prevention}
%FIXME
\subsection{Technical outline of a safety reset system}
%FIXME
\section{Communication channels on the grid}
There are a number of well-established technologies for communication on or along power lines. We can distinguish three
basic system categories: Systems using separate wires (such as DSL over landline telephone wiring), wireless radio
systems (such as LTE) and \emph{powerline communication} (PLC) systems that re-use the existing mains wiring and
superimpose data transmissions on the 50 Hz mains sine\cite{gungor01,kabalci01}.
For our scenario, we will ignore short-range communication systems. There exists a large number of \emph{wideband}
powerline communication systems that are popular with consumers for bridging ethernet between parts of an apartment or
house. These systems transmit at up to several hundred megabits over distances up to several tens of
meters\cite{kabalci01}. Technologically, these wideband PLC systems are very different from \emph{narrowband} systems
used by utilities for load management among other applications and they are not relevant to our analysis.
\subsection{Powerline communication (PLC) systems and their use}
In long-distance communications for applications such as load management, PLC systems are attractive since they allow
re-using the existing wiring infrastructure and have been used as early as in the 1930s\cite{hovi01}. Narrowband PLC
systems are a potentially low-cost solution to the problem of transmitting data at small bandwidth over distances of
several hundred meters up to tens of kilometers.
Narrowband PLC systems transmit on the order of kilobits per second or slower. A common use of this sort of system is in
\emph{ripple control} systems. These systems superimpose a low-frequency signal at some few hundred Hertz carrier
frequency on top of the 50 Hz mains sine. This low-frequency signal is used to encode switching commands for
non-essential residential or industrial loads. Ripple control systems provide utilities with the ability to actively
control demand while promising small savings in electricity cost to consumers\cite{dzung01}.
In any PLC system there is a strict tradeoff between bandwidth, power and distance. Higher bandwidth requires higher
power and reduces maximum transmission distance. Where ripple control systems usually use few transmitters to cover
the entire grid of a regional distribution utility, higher-bandwidth bidirectional systems used for automatic meter
reading (AMR) in places such as Italy or France require repeaters within a few hundred meters of a transmitter.
\subsection{Landline and wireless IP-based systems}
Especially in automated meter reading (AMR) infrastructure the cost-benefit tradeoff of powerline systems does not
always work out for utilities. A common alternative in these systems is to use the public internet for communication.
Using the public internet has the advantage of low initial investment on the part of the utility company as well as
quick commissioning. Disadvantages compared to a PLC system are potentially higher operational costs due to recurring
fees to network providers as well as lower reliability. Being integrated into power grid infrastructure, a PLC system's
failure modes are highly correlated with the overall grid. Put briefly, if the PLC interface is down, there is a good
chance that power is out, too. In contrast to this, general internet services exhibit a multitude of failures that are
entirely decorrelated from power grid stability.
For purposes such as meter reading for billing purposes, this stability is sufficient. However for systems that need to
hold up in crisis situations such as the recovery system we are contemplating in this thesis, the public internet may
not provide sufficient reliability.
\subsection{Proprietary wireless systems}
% FIXME
\subsection{Frequency modulation as a communication channel}
For our system, we chose grid frequency modulation (henceforth GFM) as a low-bandwidth uni-directional broadcast
communications channel. Compared to traditional PLC, GFM requires only a small amount of additional hardware, works
reliably throughout the grid and is harder to manipulate by a malicious actor.
Grid frequency in Europe's synchronous areas is nominally 50 Hertz, but there are small load-dependent variations from
this nominal value. Any device connected to the power grid (or even just within physical proximity of power wiring) can
reliably and accurately measure grid frequency at low hardware overhead. By intentionally modifying grid frequency, we
can create a very low-bandwidth broadcast communication channel. Grid frequency modulation has only ever been proposed
as a communications channel at very small scales in microgrids before\cite{urtasun01} but to our knowledge has not yet
been considered for large-scale application.
Advantages of using grid frequency for communication are low receiver hardware complexity as well as the fact that a
single transmitter can cover an entire synchronous area. Though the transmitter has to be very large and powerful, setup
of a single large transmitter faces lower bureaucratic hurdles than integration of hundreds of smaller ones into
hundreds of local systems each with autonomous governance.
\subsubsection{The load dependence of grid frequency}
Despite the awesome complexity of large power grids the physics underlying their response to changes in load and
generation is surprisingly simple. Individual machines (loads and generators) can be approximated by a small number of
differential equations and the entire grid can be modelled by aggregating these approximations into a large system of
linear differential equations. Evaluating these systems it has been found that in large power grids small-signal
steady-state changes in generation/consumption power balance cause a linear change in
frequency\cite{kundur01,entsoe02,entsoe04}. \emph{Small signal} here describes changes in power balance that are small
compared to overall grid power. \emph{Steady state} describes changes over a timeframe of multiple cycles as opposed to
transient events that only last a few milliseconds.
This approximately linear relationship allows the specification of a coefficient linking $\Delta P$ and $\Delta f$ with
unit \si{\watt\per\hertz}. In this thesis we are using the European power grid as our model system. We are
using data provided by ENTSO-E (formerly UCTE), the governing association of European transmission system operators. In
our calculations we use data for the continental European synchronous area, the largest synchronous area. $\frac{\Delta
P}{\Delta f}$, called \emph{Overall Network Power Frequency Characteristic} by ENTSO-E is around
\SI{25}{\giga\watt\per\hertz}.
We can derive general design parameters for any system utilizing grid frequency as a communications channel from the
policies of ENTSO-E\cite{entsoe02,entsoe03}. Probably any such system should stay below a modulation amplitude of
\SI{100}{\milli\hertz} which is the threshold defined in the ENTSO-E incidents classification scale for a Scale 0-1
(from ``Anomaly'' to ``Noteworthy Incident'' scale) frequency degradation incident\cite{entsoe03} in the Continental Europe
synchronous area.
\subsubsection{Control systems coupled to grid frequency}
The ENTSO-E Operations Handbook Policy 1 chapter defines the activation threshold of primary control to be
\SI{20}{\milli\hertz}. Ideally a modulation system would stay well below this threshold to avoid fighting the primary
control reserve. Modulation line rate should probably be on the order of a few hundred millibaud.
% FIXME is using "probably" here and in the previous paragraph ok?
Modulation at such high rates would outpace primary control action which is specified by ENTSO-E as acting within
between ``a few seconds'' and \SI{15}{\second}.
The effective \emph{Network Power Frequency Characteristic} of primary control in the European grid is reported by
ENTSO-E at around \SI{20}{\giga\watt\per\hertz}. Keeping modulation amplitude below this threshold would help to avoid
spuriously triggering these control functions. This works out to an upper bound on modulation power of
\SI{20}{\mega\watt\per\milli\hertz}.
\subsubsection{Practical transmitter implementation}
In its most basic form a transmitter for grid frequency modulation would be a very large controllable load connected to
the power grid at a suitable vantage point. A spool of wire submerged in a body of cooling water (such as a small lake
with a fence around it) along with a thyristor rectifier bank would likely suffice to perform this function during
occasional cybersecurity incidents. We can however decrease hardware and maintenance investment even further compared
to this rather uncultivated solution by repurposing regular large industrial loads to our transmitter purposes in an
emergency situation. For some preliminary exploration we went through a list of energy-intensive industries in
Europe\cite{ec01}. The most electricity-intensive industries in this list are primary aluminium and steel production.
In primary production raw ore is converted into raw metal for further refinement such as casting, rolling or extrusion.
In steelmaking iron is melted in an electric arc furnace. In aluminium smelting aluminium is electrolytically extracted
from alumina. Both processes involve large amounts of electricity with electricity making up \SI{40}{\percent} of
production costs. Given these circumstances a steel mill or aluminium smelter would be good candidates as transmitters
in a grid frequency modulation system.
In aluminium smelting high-voltage mains is transformed, rectified and fed into about 100 series-connected cells forming
a \emph{potline}. Inside the pots alumina is dissolved in molten cryolite electrolyte at about
\SI{1000}{\degreeCelsius} and electrolysis is performed using a current of tens or hundreds of kiloampere. Resulting
pure aluminium settles at the bottom of the cell and is tapped off for further processing.
Like steelworks, aluminium smelters are operated night and day without interruption. Aside from metallurgical issues the
large thermal mass and enormous heating power requirements do not permit power-cycling. Due to the high costs of
production inefficiencies or interruptions the behavior of aluminium smelters under power outages is a fairly
well-characterized phenomenon in the industry. The recent move away from nuclear power and to renewable energy has led
to an increase in fluctuations of electricity price throughout the day. These electricity price fluctuations have
provided enough economic incentive to aluminium smelters to develop techniques to modulate smelter power consumption
without affecting cell lifetime or the output product\cite{duessel01,eisma01}. Power outages of tens of minutes up to
two hours reportedly do not cause problems in aluminium potlines and are in fact part of routine operation for purposes
such as electrode changes\cite{eisma01,oye01}.
The power supply system of an aluminium plant is managed through a highly-integrated control system as keeping all cells
of a potline under optimal operating conditions is challenging. Modern power supply systems employ large banks of diodes
or SCRs to rectify low-voltage AC to DC to be fed into the potline\cite{ayoub01}. The potline voltage can be controlled
almost continuously through a combination of a tap changer and a transductor. The individual cell voltages can be
controlled by changing the anode to cathode distance (ACD) by physically lowering or raising the anode. The potline
power supply is connected to the high voltage input and to the potline through isolators and breakers.
In an aluminium smelter most of the power is sunk into resistive losses and the electrolysis process. As such an
aluminium smelter does not have any significant electromechanical inertia compared to the large rotating machines used
in other industries. Depending on the capabilities of the rectifier controls high slew rates should be possible,
permitting modulation at high\footnote{Aluminium smelter rectifiers are \emph{pulse rectifiers}. This means instead of
simply rectifying the incoming three-phase voltage they use a special configuration of transformer secondaries and in
some cases additional coils to produce a large number (such as 6) of equally spaced phases. Where
a direct-connected three-phase rectifier would draw current in 6 pulses per cycle a pulse rectifier draws current in
more, smaller pulses to increase power factor. E.g. a 12-pulse rectifier will draw current in 12 pulses per cycle. In
the best case an SCR pulse rectifier switched at zero crossing should allow \SIrange{0}{100}{\percent} load changes from
one rectifier pulse to the next, i.e. within a fraction of a single cycle.} data rates.
% FIXME validate this \subsubsection with an expert
\subsubsection{Avoiding dangerous modes}
Modern power systems are complex electromechanical systems. Each component is controlled by several carefully tuned
feedback loops to ensure voltage, load and frequency regulation. Multiple components are coupled through transmission
lines that themselves exhibit complex dynamic behavior. The overall system is generally stable, but may exhibit some
instabilities to particular small-signal stimuli. These instabilities, called \emph{modes}, occur when due to mis-tuning
of parameters or physical constraints the overall system exhibits oscillation at particular frequencies.
\textcite{kundur01} split these into four categories:
\begin{description}
\item[Local modes] where a single power station oscillates in some parameter
\item[Interarea modes] where subsections of the overall grid oscillate w.r.t.\ each other due to weak coupling
between them
\item[Control modes] caused by imperfectly tuned control systems
\item[Torsional modes] that originate from electromechanical oscillations in the generator itself
\end{description}
The oscillation frequencies associated with each of these modes are usually between a few tens of Millihertz and a few
Hertz, see for example \textcite{grebe01} and \textcite{entsoe01}. It is hard to predict the particular modes of a
power system at the scale of the Central European interconnected system. Theoretical analysis and simulation may give
rough indications but cannot yield conclusive results. Due to the obvious danger as well as high economical impact due
to inefficiencies experimental measurements are infeasible. Finally, modes are highly dependent on the power grid's
structure and will change with changes in the power grid over time. For all of these reasons, a grid frequency
modulation system must be designed very conservatively without relying on the absence (or presence) of modes at
particular frequencies. A concrete design guideline that we can derive from this situation is that the frequency
spectrum of any grid frequency modulation system should not exhibit any notable peaks and should avoid a concentration
of spectral energy in certain frequency ranges.
\subsubsection{Overall system parameters}
% FIXME
\subsubsection{An outline of practical implementation}
% FIXME
\section{From grid frequency to a reliable communications channel}
% FIXME
\subsection{Channel properties}
% FIXME
\subsection{Modulation and its parameters}
The sensitivity of the grid to oscillation at particular frequencies described above means we should avoid any
modulation technique that would concentrate a lot of energy in a small bandwidth. Taking this principle to its extreme
provides us with a useful pointer towards techniques that might work well: Spread-spectrum techniques. By employing
spread-spectrum modulation we can produce an almost ideal frequency-domain behavior that spreads the modulation energy
almost flat across the modulation bandwidth\cite{goiser01} while at the same time achieving some modulation gain,
increasing system sensitivity. This modulation gain spread-spectrum techniques yield potentially allows us to use a
weaker stimulus, allowing further reduction of the probability of disturbance to the overall system. Spread-spectrum
techniques also inherently allow us to tune the tradeoff between receiver sensitivity and data rate. This tunability is
a highly useful parameter to have for the overall system design.
Spread spectrum covers a whole family of techniques. \textcite{goiser01} separates these techniques into the coarse
categories of \emph{Direct Sequence Spread Spectrum}, \emph{Frequency Hopping Spread Spectrum} and \emph{Time Hopping
Spread Spectrum}.
\textcite{goiser01} assumes a BPSK or similar modulation underlying the spread-spectrum technique. Our grid frequency
modulation channel effectively behaves more like a DC-coupled wire than a traditional radio channel: Any change in
excitation will cause a proportional change in the receiver's measurement. Using our FFT-based measurement methodology
we get a real-valued signed quantity. In this way grid frequency modulation is similar to a channel using coherent
modulation. We can transmit not only signal strength, but polarity too.
For our purposes we can discount both Time and Frequency Hopping Spread Spectrum techniques. Time
hopping aids to reduce interference between multiple transmitters but does not help with SNR any more than Direct
Sequence does. % FIXME verify this.
Our system is strictly limited to a single transmitter so we do not gain anything through Time Hopping.
Frequency Hopping Spread Spectrum techniques require a carrier. Grid frequency modulation itself is very limited in
peak frequency deviation $\Delta f$. Frequency hopping could only be implemented as a second modulation on top of GFM,
but this would not yield any benefits while increasing system complexity and decreasing data bandwidth.
Direct Sequence Spread Spectrum is the only remaining approach for our application. Direct Sequence Spread Spectrum
works by directly modulating a long pseudorandom bit sequence onto the channel. The receiver must know the same
pseudo-random bit sequence and continuously calculates the correlation between the received signal and the pseudo-random
template sequence mapped from binary $[0, 1]$ to bipolar $[1, -1]$. Since the pseudorandom sequence has an approximately
equal number of $0$ and $1$ bits, the correlation between the sequence and uncorrelated noise is small. The positive
contribution of the $+1$ terms of the correlation template approximately cancels out with the $-1$ terms when multiplied
with an uncorrelated signal such as white gaussian noise or another pseudo-random sequence.
By using a family of pseudo-random sequences with low cross-correlation channel capacity can be increased. Either the
transmitter can encode data in the choice of sequence or multiple transmitters can use the same channel at once. The
longer the pseudo-random sequence the lower its cross-correlation with noise or other pseudorandom sequences of the same
length. Choosing a long sequence we increase modulation gain while decreasing bandwidth. For any given application the
sweet spot will be the shortest sequence that is long enough to yield sufficient SNR for subsequent processing layers
such as channel coding.
A popular family of codes used in many DSSS systems is Gold codes. A set of Gold codes has small cross-correlations. For some
value $n$ a set of Gold codes contains $2^n + 1$ sequences of length $2^n - 1$. Gold codes are generated from two
different maximum length sequences generated by linear feedback shift registers (LFSRs). For any bit count $n$ there are
certain empirically determined preferred pairs of LFSRs that produce Gold codes with especially good cross-correlation.
The $2^n + 1$ Gold codes are defined as the XOR sum of both LFSR sequences shifted from $0$ to $2^n-2$ bits as well as
the two individual LFSR sequences. Given LFSR sequences \texttt{a} and \texttt{b} in numpy notation this is
\mintinline{python}{[a, b] + [ a ^ np.roll(b, shift) for shift in range(len(b)) ]}.
In DSSS modulation the individual bits of the DSSS sequence are called \emph{chips}. Chip duration determines modulation
bandwidth\cite{goiser01}. In our system we are directly modulating DSSS chips on mains frequency without an underlying
modulation such as BPSK as it is commonly used in DSSS systems.
\subsection{Error-correcting codes}
To make our overall system reliable we have to layer some channel coding on top of our DSSS modulation. The messages we
expect to transmit are at least a few tens of bits long. We are highly constrained in SNR due to limited transmission
power. With lower SNR comes higher BER (bit error rate). Packet error rate grows exponentially with transmission length.
For our relatively long transmissions we would realistically get unacceptable error rates.
Error correcting codes are a very broad field with many options for specialization. Since we are implementing nothing
more than a prototype in this thesis we chose to not expend resources on optimization too much and settled for a
comparatively simple low-density parity check code. The state of the art has advanced considerably since the discovery
of general LDPC codes. %FIXME cite
% FIXME LDPC is old, new is Reed-Solomon!
The main areas of improvement are overhead and decoding speed. Since transmission length % FIXME have we defined this yet?
in our system limits system response time but we do not have a fixed target there we can tolerate some degree of
sub-optimal overhead. % FIXME get actual pröper numbers on our stuff vs. some state of the art citations.
Decoding speed is of no concern to us as our data rate is extremely low.
An important concern for our prototype implementation was the availability of reference implementations of our error
correcting code. We need a python implementation for test signal generation on a regular computer and we need a small C
or C++ implementation that we can adapt to embedded firmware. LDPC codes are a popular textbook example of
error-correcting codes and we had no particular difficulty finding either.
\subsection{Cryptographic security}
\label{sec-crypto}
Informally the system we are looking for can be modelled as consisting of three parties: The trusted
\textsc{Transmitter}, one of a large number of untrusted \textsc{Receivers}, and an \textsc{Attacker}. These three play
according to the following rules:
\begin{enumerate}
\item \textsc{Transmitter} and \textsc{Attacker} can both transmit any bit sequence
\item \textsc{Receiver} receives any transmission by either \textsc{Transmitter} or \textsc{Attacker} but cannot
distinguish between the two on the signal level
\item \textsc{Attacker} knows anything a \textsc{Receiver} might know
\item \textsc{Transmitter} is stronger than \textsc{Attacker} and will ``win'' in simultaneous transmission
\item Both \textsc{Transmitter} and \textsc{Receiver} can be seeded with some information on each other such as
public key fingerprints.
\end{enumerate}
We are not interested in congestion scenarios where an attacker attempts to disrupt an ongoing transmission by the
transmitter. In practice there are several avenues to prevent such attempts including the following. Compromised loads
that are being abused by the attacker can be manually disconnected by the utility. Error-correcting codes can be used to
provide resiliency against small-scale disturbances. Finally, the transmitter can be designed to have high enough power
to be able to override any likely attacker.
Our goal is to find a cryptographic primitive that has the following properties:
\begin{enumerate}
\item \textsc{Transmitter} can produce a transmission bit sequence $\mathbf{s}$ (or equivalently a set of such
sequences) that \textsc{Receiver} can uniquely identify as being generated by \textsc{Transmitter}:
$\mathcal{R}\left(\mathbf{s}\right) = 1$. Upon reception of this sequence, \textsc{Receiver} performs the safety
reset.
\item \textsc{Attacker} cannot forge $\mathbf{s}$, that is find $\mathbf{s}'$ such that
$\mathbf{s} \neq \mathbf{s}' \land \mathcal{R}\left(\mathbf{s}'\right) = 1$
\item Our system conforms to an at-most-once semantic. That is, upon transmission of a valid bit sequence coded for
a particular \textsc{Receiver} or set of receivers each one either performs exactly one safety reset or none at
all. We cannot achieve an exactly-once semantic since we are using a unidirectional lossy communication
primitive. More colloquially, \textsc{Receiver} might be offline due to a localized power outage and might thus
not hear \textsc{Transmitter} even if our broadcast primitive is reliable. The practical impact of this
limitation can be mitigated by the transmitter simply repeating itself until the desired effect has been achieved.
\end{enumerate}
An important limitation from the rules of our setup above is that \textsc{Attacker} can always record the bit sequence
\textsc{Transmitter} transmits and replay that same sequence later. Before considering any cryptographic approaches we
can make the preliminary observation that we can trivially prevent \textsc{Attacker} from violating the
at-most-once criterion by simply requiring \textsc{Receiver} to memorize all bit sequences that have been transmitted
thus far and only reacting to new bit sequences. This means an attacker might be able to cause offline receivers to
reset at a later point, but considering our goal is to reset them in the first place this would not pose a danger to the
system.
% FIXME elaborate why this is not a threat, and possible mitigations
As it seems we need a cryptographic primitive that looks somewhat like a signature. Different from a signature however,
we have somewhat relaxed constraints here: While cryptographic signatures need to work over arbitrary inputs, all we
want to ``sign'' here is the instruction to perform a safety reset. Since this is the only message we might ever want to
transmit, our message space has only one entry and thus the informational content of our message is 0 bit! All the
information we want to transmit is already encoded \emph{in the fact that we are transmitting}, and we do not require
any further payload to be transmitted. This means we can omit the entirety of the message and just transmit whatever
``signature'' we produce. This is useful since we have to conserve transmission bits so our transmissions do not take
an exceedingly long time over our extremely slow communication channel.
We could use any of several traditional asymmetric cryptographic primitives to produce these signatures. The
comparatively high computational effort required for signature verification would not be an issue. Transmissions take
several minutes anyway and we can afford to spend some tens of seconds even in signature verification. Transmission
length and by proxy system latency would be determined by the length of the signature. For RSA signature length is the
modulus length (i.e. larger than 1000 bit for even basic contemporary security). For elliptic curve-based systems
signature size is approximately twice the curve length (i.e. $\sim$300 bit for contemporary security). However, we can do
better than this: We can exploit the strange nature of our setting that our effective message entropy is 0 bit to derive
a more efficient scheme.
\subsubsection{Lamport signatures}
In 1979, \textcite{lamport02} introduced a signature scheme that is based only on a one-way function such as a
cryptographic hash function. The basic observation is that by choosing a random secret input to a one-way function and
publishing the output, one can later prove knowledge of the input by simply publishing it. In the following paragraphs
we will describe a construction of a one-time signature scheme based on this observation. The scheme we describe is the
one usually called a ``Lamport Signature'' in modern literature and is slightly different from the variant described in
the 1979 paper, but for our purposes we can consider both to be equivalent.
\paragraph{Setup.} In a Lamport signature, for an n-bit hash function $H$ the signer generates a private key $s =
\left(s_{b, i} | b\in\left\{0, 1\right\}, 0\le i<n\right)$ of $2n$ random strings of length $n$. The signer publishes a
public key $p = \left(p_{b, i} = H\left(s_{b, i}\right), b\in\left\{0, 1\right\}, 0\le i<n\right)$ that is simply the
list of hashes of each of the random strings that make up the private key.
\paragraph{Signing.} To sign a message $m$, the signer publishes the signature $\sigma = \left(\sigma_i = s_{H(m)_i,
i}\right)$ where $H(m)_i$ is the $i$-th bit of $H$ applied to $m$. That is, for the $i$-th bit of the message's hash
$H(m)$ the signer publishes either of $s_{0, i}$ or $s_{1, i}$ depending on the hash bit's value, keeping the other
entry of $s$ secret.
\paragraph{Verification.} The verifier can compute $H(m)$ themselves and check that the corresponding entries $\sigma_i =
s_{H(m)_i, i}$ of $\sigma$ correctly evaluate to $p_{H(m)_i, i} = H\left(\sigma_i\right)$ from $p$ under $H$.
The above scheme is a one-time signature scheme only. After one signature has been published for a given key, the
corresponding key must not be re-used for other signatures. This is intuitively clear as we are effectively publishing
part of the private key as the signature, and if we were to publish a signature for another message an attacker could
derive additional signatures by ``mixing'' the two published signatures.
\subsubsection{Winternitz Signatures}
An improvement to basic Lamport signatures as described above are Winternitz signatures as detailed in
\textcite{merkle01} and \textcite{dods01}. Winternitz signatures reduce public key length as well as signature length
for hash length $n$ from $2n$ to $\mathcal O \left(n/t\right)$ for some choice of parameter $t$ (usually a small number
such as 4).
\paragraph{Setup.} The signer generates a private key $s = \left(s_i\right)$ consisting of $\ceil{\frac{n}{t}}$ random
bit strings. The signer publishes a public key $p = \left(H^{2^t}\left(s_i\right)\right)$ where each element
$H^{2^t}\left(s_i\right)$ is the $2^t$-fold recursive application of $H$ to $s_i$.
\paragraph{Signing.} The signer splits $m$ padded to a multiple of $t$ bits into $\ceil{\frac{n}{t}}$ chunks $m_i$ of
$t$ bit each. The signer publishes the signature $\sigma = \left( \sigma_i = H^{m_i}\left(s_i\right) \right)$.
\paragraph{Verification.} The verifier can calculate for each $\sigma_i = H^{m_i}\left(s_i\right)$ that $H^{2^t -
m_i}\left(\sigma_i\right) = H^{2^t - m_i}\left(H^{m_i}\left(s_i\right)\right) = H^{2^t - m_i + m_i} \left(s_i\right) =
p_i$.
To prevent an attacker from forging additional signatures from one signature by calculating $\sigma_i' =
H\left(\sigma_i\right)$ matching $m_i' = m_i + 1$, this scheme is usually paired with a simple checksum as described in
\textcite{merkle01}.
\subsubsection{Using hash-based signatures for trigger authentication}
The most basic possible trigger authentication scheme would be to simply generate a random bit string secret key $s$ and
publish $p = H(s)$ for some hash function $H$. To activate the trigger, $\sigma = s$ would be published and listeners
could verify that $H(\sigma) = p = H(s)$. This simplistic scheme has one main disadvantage: It is a fundamentally
one-time construction. To prevent an attacker from re-triggering a listener a second time by replaying a valid trigger
$\sigma$ all listeners have to blacklist any ``used'' $\sigma$. Alas, this means we can only ever trigger a listener
\emph{once}. The good part is that any listener that missed this trigger can still be triggered later, but the bad part
is that once $s$ is burned we are out of options. The trivial solution to this would be to simply inform each listener
with a whole list of public keys in advance. This however takes $n$ times the amount of space for $n$-fold
retriggerability. Luckily we can easily derive a scheme that yields $n$-fold retriggerability while using no more
space than the original scheme by taking some inspiration from Winternitz signatures above.
In this scheme the secret key $s$ is still a random bit string. The public key is $p = H^n(s)$ for $n$-times
retriggerability. The $i$-th time the trigger is activated, $\sigma_i = H^{n-i}(s)$ is published, and every listener can
verify that $\sigma_{i-1} = H\left(\sigma_i\right)$ with $\sigma_0 = p$. In case a listener missed one or more previous
triggers it can simply continue computing $H\left(H\left(\sigma_i\right)\right)$ and
$H\left(H\left(H\left(\sigma_i\right)\right)\right)$ until either reaching the $n$-th recursion level (indicating an
invalid signature) or finding $H^k\left(\sigma_i\right) = \sigma_j$ for some $k \le n$ with $\sigma_j$ being the last signature this
listener recorded, or $p$ in case there is none.
This scheme provides replay protection through listeners memorizing the last signature they activated to. Public key
length is equal to the length of the hash function $H$ used. Even for our embedded systems use case $n$ can
realistically be up to $\mathcal O\left(10^3\right)$, which is easily enough for our application.
% FIXME here and in previous ~2 pages get transmitter/receiver and sender/listener terminology straight. Also perhaps do
% some sort of scenario definition introducing those terms somewhere.
\chapter{Practical implementation}
\section{Cryptographic validation}
%FIXME
\section{Data collection for channel validation}
To design a solid system we needed to parametrize mains frequency variations under normal conditions. To set modulation
amplitude as well as parameters of our modulation scheme we need a frequency spectrum of mains frequency variations
(that is $\mathcal F\left(f(V(t))\right)$: Taking mains frequency $f(x)$ as a variable, the frequency spectrum of that
variable, as opposed to the frequency spectrum of mains voltage $V(t)$ itself).
\subsection{Grid Frequency Estimation}
\label{frequency_estimation}
In commercial power systems Phasor Measurement Units (PMUs) are used to precisely measure parameters of a mains voltage
waveform. One of the parameters PMUs measure is mains frequency. PMUs are used as part of SCADA systems controlling
transmission networks to characterize the operational state of the network.
From a superficial viewpoint measuring mains frequency might seem like a simple problem. Take the mains voltage
waveform, measure time between two rising-edge (or falling-edge) zero-crossings and take the inverse $f = t^{-1}$. In
practice, phasor measurement units are significantly more complex than this. This discrepancy is due to the unhealthy
% FIXME is this pun ok?
combination of both high precision and quick response that is demanded from these units. High precision is necessary
since variations of mains frequency under normal operating conditions are quite small--in the range of
\SIrange{5}{10}{\milli\hertz} over short intervals of time. Relative to the nominal \SI{50}{\hertz} this is a deviation of
less than \SI{100}{ppm}. Relative to the corresponding \SI{20}{\milli\second} period that means a time deviation of
about $2 \mu\text{s}$ from cycle to cycle. From this it is already obvious why a simplistic measurement cannot yield the
required precision for manageable averaging times--we would need either an ADC sampling rate in the order of megasamples per second or
for a reconstruction through interpolated readings an impractically high ADC resolution.
Detail on the inner workings of commercial phasor measurement units is scarce but given their essential role to SCADA
systems there is a large amount of academic research on such algorithms\cite{narduzzi01,derviskadic01}. A popular
approach to these systems is to perform a Short-Time Fourier Transform (STFT) on ADC data sampled at high sampling rate
(e.g. \SI{10}{\kilo\hertz}) and then perform some analysis on the frequency-domain data to precisely locate the strong peak
around \SI{50}{\hertz}. A key observation here is that FFT bin size is going to be much larger than required frequency
resolution. This fundamental limitation follows from the Nyquist criterion % FIXME maybe cite?
and if we had to process an \emph{arbitrary} signal this would highly limit our practical measurement accuracy
\footnote{
Some software packages providing FFT or STFT primitives such as scipy\cite{virtanen01} allow the user to
super-sample FFT output by specifying an FFT width larger than input data length, padding the input data with zeros
on both sides. Note that in line with Nyquist this \emph{does not} actually provide finer output resolution but
instead just amounts to an interpolation between output bins. Depending on the downstream analysis algorithm it may
still be sensible to use this property of the DFT for interpolation, but in general it will be computationally
expensive compared to other interpolation methods and in any case it will not yield any better frequency resolution
aside from a hypothetical numerical advantage\cite{gasior02}.
}.
For this reason all approaches to mains frequency estimation are based on a model of the mains voltage waveform.
Nominally, this waveform would be a perfect sine at $f=\SI{50}{\hertz}$. In practice it is a sine at
$f\approx\SI{50}{\hertz}$ superimposed with some aperiodic noise (e.g. irregular spikes from inductive loads being
energized) as well as harmonic distortion that is caused by grid-topologically nearby devices with power factor
\footnote{
Power factor is a power engineering term that is used to describe how close the current waveform of a load is to
that of a purely resistive load. Given sinusoidal input voltage $V(t) = V_\text{pk} \sin \paren{\omega_\text{nom}
t}$ with $\omega_\text{nom} = 2 \pi f_\text{nom} = 2 \pi \cdot \SI{50}{\hertz}$ being the nominal angular frequency,
the current waveform of a resistor with resistance $R \left[\Omega\right]$ according to Ohm's law would be $I(t) =
\frac{V(t)}{R} = \frac{1}{R} V_\text{pk} \sin\paren{\omega_\text{nom} t}$. In this case voltage and current are
perfectly in phase, i.e. the current at time $t$ is linear in voltage at constant factor $\frac{1}{R}$.
In contrast to this idealized scenario reality provides us with two common issues: One, the load may be reactive.
This means its current waveform is an ideal sinusoid, but there is a phase difference between mains voltage and load
current like so: $I(t) = \frac{1}{\left|Z\right|} V_\text{pk} \sin\paren{\omega_\text{nom} t +
\varphi}$. $Z$ would be the load's complex impedance combining inductive, capacitive and resistive components and
$\varphi$ the phase difference between the resulting current waveform and the mains voltage waveform. A common case
of such loads are motors and the inductive ballasts in old fluorescent lighting fixtures.
The second potential issue are loads with non-sinusoidal current waveform. There are many classes of these but the
most common one are switching-mode power supplies. Most SMPS for modern electronic devices have an input stage
consisting of a bridge rectifier followed by a capacitor that provides high-voltage DC power to the following
switch-mode converter circuit. This rectifier-capacitor input stage under normal load draws a high current only at the
very peak of the input voltage sinusoid and draws almost zero current for most of the period.
These two cases are measured by \emph{displacement power factor} and \emph{distortion power factor} that when
combined yield the overall true power factor. The power factor is a key quantity in the design and operation of the
power grid since a high power factor (close to $1.0$ or an in-phase sinusoidal current waveform) yields lowest
transmission and generation losses.
}
$\cos \theta \neq 1.0$. Under a continuous Fourier transform over a long period the frequency spectrum of a signal
distorted like this will be a low noise floor depending mainly on aperiodic noise on which a comb of harmonics as well
as some sub-harmonics of $f \approx f_\text{nom} = \SI{50}{\hertz}$ rides. The main peak at $f \approx f_\text{nom}$
will be very strong with the harmonics being approximately an order of magnitude weaker in energy and the noise floor
being at least another order of magnitude weaker. See figure \ref{mains_voltage_spectrum} for a measured spectrum. This
domain knowledge about the expected frequency spectrum of the signal can be employed in a number of interpolation
techniques to re-construct the precise frequency of the spectrum's main component despite comparatively coarse STFT
resolution and despite numerous distortions.
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/mains_voltage_spectrum}
\caption{Fourier transform of a 24 hour capture of mains voltage. Data was captured using our frequency measurement
sensor described in section \ref{sec-fsensor} and FFT'ed after applying a Blackman window. Vertical lines indicate
\SI{50}{\hertz} and odd harmonics.}
\label{mains_voltage_spectrum}
\end{figure}
Published grid frequency estimation algorithms such as \textcite{narduzzi01} or \textcite{derviskadic01} are rather
sophisticated and use a combination of techniques to reduce numerical errors in FFT calculation and peak fitting. Given
that we do not need reference standard-grade accuracy for our application we chose to start with a very basic algorithm
instead. We chose to use a general approach developed by experimental physicists at CERN that is described by
\textcite{gasior01}. This approach assumes a general sinusoidal signal superimposed with harmonics and broadband noise.
Applicable to a wide spectrum of practical signal analysis tasks it is a reasonable first-degree approximation of the
much more sophisticated estimation algorithms developed specifically for power systems. Some algorithms have components
such as Kalman filters\cite{narduzzi01} that require a physical model. As a general algorithm, the approach from \textcite{gasior01}
does not require this kind of application-specific tuning, eliminating one source of error.
\subsection{Frequency sensor hardware design}
\label{sec-fsensor}
Our safety reset controller % FIXME is this the right term?
will have to measure mains frequency to later demodulate a reset signal transmitted through it. Since we have decided to
do our own frequency measurement system here we can use this frequency measurement setup as a prototype for the
frequency measurement subcomponent of the demodulation system we will later develop. Since we do not plan to do a
large-scale field deployment of our measurement setup we can keep the hardware implementation simple by moving most of
the signal processing to a regular computer and concentrating our hardware efforts on raw signal capture.
\begin{figure}
\begin{center}
\begin{tikzpicture}[start chain = going below, node distance = 12mm and 50mm, every join/.style = {norm}]
\tikzset{
base/.style = {draw, on chain, on grid, align=center, minimum height = 4ex, font=\footnotesize},
text/.style = {base},
component/.style = {base, rectangle, text width=40mm},
coord/.style = {coordinate, on chain, on grid, node distance=6mm and 25mm}
}
\node[text centered] (input) {Single-Phase Mains Input};
\node[component] (safety) [below = of input] {Input Protection};
\node[coord] (safety-anchor) [below = of safety] {};
\node[component] (analog) [below = of safety-anchor] {Analog Signal Processing};
\node[component] (powersupply) [left = of analog] {Power supply};
\node[component] (adc) [below = of analog] {ADC};
\node[component] (micro) [below = of adc] {Microcontroller};
\node[component] (isol) [below = of micro] {Galvanic Digital Isolation};
\node[coord] (isol-left) [left = 6cm of isol.west] {};
\node[coord] (isol-right) [right = 1cm of isol.east] {};
\node[component] (usb) [below = of isol] {USB interface};
\draw[->] (input.south) -- (safety.north);
\draw[-] (safety.south) -- (safety-anchor);
\draw[->] (safety-anchor) -| (powersupply.north);
\draw[->] (safety-anchor) -| (analog.north);
\draw[->] (powersupply.south) |- (adc.west);
\draw[->] (powersupply.south) |- (micro.west);
\draw[->] (analog.south) -- (adc.north);
\draw[->] (adc.south) -- (micro.north);
\draw[->] (micro.south) -- (isol.north);
\draw[->] (isol.south) -- (usb.north);
\draw[dashed] (isol.west) -- (isol-left.east);
\draw[dashed] (isol.east) -- (isol-right.west);
\end{tikzpicture}
\end{center}
\caption{Frequency sensor hardware diagram}
\label{fmeas-sens-diag}
\end{figure}
An overall block diagram of our system is shown in fig. \ref{fmeas-sens-diag}. The microcontroller we chose is an
\texttt{STM32F030F4P6} ARM Cortex-M0 microcontroller made by ST Microelectronics. The ADC in fig. \ref{fmeas-sens-diag}
in our design is the integrated 12-bit ADC of this microcontroller, which is sufficient for our purposes. The USB
interface is a simple USB to serial converter IC (\texttt{CH340G}) and the galvanic digital isolation is accomplished
with a pair of high-speed optocouplers on its \texttt{RX} and \texttt{TX} lines. The analog signal processing is a
simple voltage divider using high-power resistors to get the required creepage along with some high-frequency filter
capacitors and an op-amp buffer. The power supply is an off-the-shelf mains-input power module. The system is
implemented on a single two-layer PCB that is housed in an off-the-shelf industrial plastic case fitted with a printed
label and a few status lights on its front.
\subsection{Clock accuracy considerations}
Our measurement hardware will sample line voltage at some sampling rate $f_S$, e.g.\ \SI{1}{\kilo\hertz}. All downstream
processing is limited in accuracy by the accuracy of $f_S$\footnote{
We are not considering the effects of clock jitter. We are highly oversampling the signal and the FFT done in our
downstream processing will eliminate small jitter effects leaving only frequency stability to worry about. }. We
generate our sampling clock in hardware by clocking the ADC from one of the microcontroller's timer blocks clocked from
the microcontroller's system clock. This means our ADC's sampling window will be synchronized cycle-accurate to the
microcontroller's system clock.
Our downstream measurement of mains frequency by nature is relative to our sampling frequency $f_S$. In the setup
described above this means we have to make sure our system clock is fairly stable. A frequency deviation of \SI{1}{ppm}
in our system clock causes a proportional grid frequency measurement error of $\Delta f = f_\text{nom} \cdot
10^{-6} = \SI{50}{\micro\hertz}$. In a worst-case where our system is clocked from a particularly bad crystal that exhibits
\SI{100}{ppm} of instabilities over our measurement period we end up with an error of \SI{5}{\milli\hertz}. This is well
within our target measurement range, so we need a more stable clock source. Ideally we want to avoid writing our own
clock conditioning code where we try to change an oscillator's operating frequency to match some reference. Clock
conditioning algorithms are highly complex and in our case post-processing of measurement data and simply adding an
offset is simpler and less error-prone.
Our solution to these problems is to use a crystal oven\footnote{
A crystal oven is a crystal oscillator thermally coupled closely to a heater and temperature sensor and enclosed in
a thermally isolated case. The heater is controlled to hold the crystal oscillator at a near-constant temperature
a few tens of degrees above ambient. Any ambient temperature variations will be absorbed by the temperature control.
This yields a crystal frequency that is almost completely unaffected by ambient temperature variations below the
oven temperature and whose main remaining instability is aging.
} as our main system clock source. Crystal ovens are expensive compared to ordinary crystal oscillators. Since any
crystal oven will be much more accurate than a standard room-temperature crystal we chose to reduce cost by using one
recycled from old telecommunications equipment.
To verify clock accuracy we routed an externally accessible SMA connector to a microcontroller pin that is routed to one
of the microcontroller's timer inputs. By connecting a GPS 1pps signal to this pin and measuring its period we can
calculate our system's Allan variance\footnote{
Allan variance is a measure of frequency stability between two clocks.
}, thereby measuring both clock stability and clock accuracy.
We ran a 4 hour test of our frequency sensor that generated the histogram shown in figure \ref{ocxo_freq_stability}.
These results show that while we get a systematic error of about \SI{10}{ppm} due to manufacturing tolerances the
random error at less than \SI{10}{ppb} is smaller than that of a room-temperature crystal oscillator by 3-4 orders of
magnitude. Since we are interested in grid frequency variations over time but not in the absolute value of grid
frequency the systematic error is of no consequence to us. The random error at \SI{3.66}{ppb} corresponds to a
frequency measurement error of about \SI{0.2}{\micro\hertz}, well below what we can achieve at reasonable sampling rates
and ADC resolution.
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/ocxo_freq_stability}
\caption{OCXO Frequency deviation from nominal \SI{19.440}{\mega\hertz} measured against GPS 1pps}
\label{ocxo_freq_stability}
\end{figure}
\subsection{Firmware implementation}
The firmware uses one of the microcontroller's timers clocked from an external crystal oscillator to produce a
\SI{1}{\milli\second} tick that the internal ADC is triggered from for a sample rate of \SI{1}{\kilo sps}. Higher sample
rates would be possible but reliable data transmission over the opto-isolated serial interface might prove challenging
and \SI{1}{\kilo sps} corresponds to $20$ samples per cycle at $f_\text{nominal}$. This is $10\times$ Nyquist and should
be plenty for accurate measurements.
The ADC measurements are read using DMA and written into a circular buffer. Using some DMA controller features this
circular buffer is split in back and front halves with one being written to and the other being read at the same time.
Buffer contents are moved from the ADC DMA buffer into a packet-based reliable UART interface as they come in. The UART
packet interface keeps two ringbuffers: One byte-based ringbuffer for transmission data and one ringbuffer pointer
structure that keeps track of ADC data packet boundaries in the byte-based ringbuffer. Every time a chunk of data is
available from the ADC the data is framed into the byte-based ringbuffer and the packet boundaries are logged in the
packet pointer ringbuffer. If the UART transmitter is idle at this time a DMA-backed transmission of the oldest packet
in the packet ringbuffer is triggered at this point. Data is framed using Consistent Overhead Byte Stuffing
(COBS)\footnote{
COBS is a framing technique that allows encoding $n$ bytes of arbitrary data into exactly $n+1$ bytes with no embedded
$0$-bytes that can then be delimited using $0$-bytes. COBS is simple to implement and allows both one-pass decoding and
encoding. The encoder either needs to be able to read up to \SI{256}{\byte} ahead or needs a buffer of \SI{256}{\byte}.
COBS is very robust in that it allows self-synchronization. At any point a receiver can reliably synchronize itself
against a COBS data stream by waiting for the next $0$-byte. The constant overhead allows precise bandwidth and buffer
planning and provides constant, good efficiency close to the theoretical maximum.}\cite{cheshire01} along with a
CRC-32 checksum for error checking. When the host receives a new packet with a valid checksum it returns an
acknowledgement packet to the sensor. When the sensor receives the acknowledgement, the acknowledged packet is dropped
from the transmission packet ringbuffer. When the host detects an incorrect checksum it simply stays quiet and waits for
the sensor to resume with retransmission when the next ADC buffer has been received.
The serial interface logic presents most of the complexity of the sensor firmware. This complexity is necessary since
we need reliable, error-checked transmission to the host. Though rare, bit errors on a serial interface do happen and
data corruption is unacceptable. The packet-layer queueing on the sensor is necessary since the host is not a realtime
system and unpredictable latency spikes of several hundred milliseconds are possible.
The host in our recording setup is a Raspberry Pi 3 model B running a Python script. The Python script handles serial
communication and logs data and errors into an SQLite database file. SQLite has been chosen for its simple yet flexible
interface and its good tolerance of system resets due to unexpected power loss. Overall our setup performed adequately
with IO contention on the Raspberry Pi/Linux side causing only 16 skipped sample packets over a 68-hour recording span.
\subsection{Frequency sensor measurement results}
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/freq_meas_trace_24h}
\caption{Trace of grid frequency over a 24 hour window. One clearly visible feature is the occurrence of large positive
and negative transients at full hours. Times shown are UTC. Note that the European continental synchronous area that this
sensor is placed in covers several time zones which may result in images of daily load peaks appearing in 1 hour
intervals. Fig.\ \ref{freq_meas_trace_mag} contains two magnified intervals from this plot.}
\label{freq_meas_trace}
\end{figure}
\begin{figure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics{../lab-windows/fig_out/freq_meas_trace_2h_1}
\caption{A 2 hour window around 00:00 UTC.}
\end{subfigure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics{../lab-windows/fig_out/freq_meas_trace_2h_2}
\caption{A 2 hour window around 18:30 UTC.}
\end{subfigure}
\caption{Two magnified 2 hour windows of the trace from fig.\ \ref{freq_meas_trace}.}
\label{freq_meas_trace_mag}
\end{figure}
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/mains_voltage_spectrum}
\caption{Power spectral density of the mains voltage trace in fig. \ref{freq_meas_trace}. We can see the expected
peak at \SI{50}{\hertz} along with smaller peaks at odd harmonics. We can also see a number of spurious tones both
between harmonics and at low frequencies, as well as some bands containing high noise energy around
\SI{0.1}{\hertz}. This graph demonstrates a high signal-to-noise ratio that is not very demanding on our frequency
estimation algorithm.
}
\label{mains_voltage_spectrum}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_spectrum}
\caption{Power spectral density of the 24 hour grid frequency trace in fig. \ref{freq_meas_trace} with some notable
peaks annotated with the corresponding period in seconds. The $\frac{1}{f}$ line indicates a pink noise spectrum.
Around a period of \SI{20}{\second} the PSD starts to fall off at about $\frac{1}{f^3}$ until we can make out some
bumps at periods around $2$ and \SI{3}{\second}. Starting at around \SI{1}{\hertz} we can see a white noise floor in
the order of \si{\micro\hertz^2\per\hertz}.
% TODO: where does this noise floor come from? Is it a fundamental property of the grid? Is it due to limitations of
% our measurement setup (such as ocxo stability/phase noise) ???
}
\label{freq_meas_spectrum}
\end{figure}
Captured raw waveform data is processed in the Jupyter Lab environment\cite{kluyver01} and grid frequency estimates are
extracted as described in sec. \ref{frequency_estimation} using the \textcite{gasior01} technique. Appendix
\ref{grid_freq_estimation_notebook} contains the Jupyter notebook we used for frequency measurement. In fig.\
\ref{freq_meas_feedback} we fed back to the frequency estimator its own output giving us an indication of its numerical
performance. The result was \SI{1.3}{\milli\hertz} of RMS noise over a \SI{3600}{\second} simulation time. This
indicates performance is good enough for our purposes. In addition to this we validated our algorithm's performance by
applying it to the test waveforms from \textcite{wright01}. In this test we got errors of \SI{4.4}{\milli\hertz} for the
\emph{noise} test waveform, \SI{0.027}{\milli\hertz} for the \emph{interharmonics} test waveform and
\SI{46}{\milli\hertz} for the \emph{amplitude and phase step} test waveform. Full results can be found in fig.\
\ref{freq_meas_rocof_reference}.
\begin{figure}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_feedback}
\caption{
The frequency estimation algorithm applied to a synthetic noise-less mains waveform generated from its own
output. This feedback simulation gives an indication of numerical errors in our estimation algorithm. The top
four graphs show a comparison of the original trace (blue) and the re-calculated trace (orange). The bottom
trace shows the difference between the two. As we can tell both traces agree very well with an overall RMS
deviation of about \SI{1.3}{\milli\hertz}. The bottom trace shows deviation growing over time. This is very
likely an effect of numerical errors in our ad-hoc waveform generator.
}
\label{freq_meas_feedback}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/freq_meas_rocof_reference}
\caption{
Performance of our frequency estimation algorithm against the test suite specified in \textcite{wright01}. Shown
are standard deviation and variance measurements as well as time-domain traces of differences.
}
\label{freq_meas_rocof_reference}
\end{figure}
\section{Channel simulation and parameter validation}
\label{sec-ch-sim}
To validate all layers of our communication stack from modulation scheme to cryptography we built a prototype
implementation in Python. Implementing all components in a high-level language builds up familiarity with the concepts
while taking away much of the implementation complexity. For our demonstrator we will not be able to use Python since
our target platform is a cheap low-end microcontroller. Our demonstrator firmware will have to be written in a low-level
language such as C or Rust. For prototyping these languages lack flexibility compared to Python.
% FIXME introduce project outline, specs -> proto -> demo above!
To validate our modulation scheme we first performed a series of simulations on our python demodulator prototype
implementation. To simulate a modulated grid frequency signal we added noise to a synthetic modulation signal. For most
simulations we used measured frequency data gathered with our frequency sensor. We only have a limited amount of capture
data. Re-using segments of this data as background noise in multiple simulation runs could hypothetically lead to our
simulation results depending on individual features of this particular capture that would be common between all runs. To
estimate the impact of this problem we re-ran some of our simulations with artificial random noise synthesized with a
power spectral density matching that of our capture. To do this, we first measured our capture's PSD, then fitted a
low-resolution spline to the PSD curve in log-log coordinates. We then generated white noise, multiplied the resampled
spline with the DFT of the synthetic noise and performed an iDFT on the result. The resulting time-domain signal is our
synthetic grid frequency data. Fig.\ \ref{freq_meas_spectrum} shows the PSD of our measured grid frequency signal. The
red line indicates the low-resolution log-log spline interpolation used for shaping our artificial noise. Fig.\
\ref{simulated_noise_spectrum} shows the PSD of our simulated signal overlaid with the same spline as a red line and
shows time-domain traces of both simulated (blue) and reference signals (orange) at various time scales. Visually both
signals look very similar, suggesting we have found a good synthetic approximation of our measurements.
\begin{figure}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/simulated_noise_spectrum}
\caption{Synthetic grid frequency in comparison with measured data. The topmost graph shows the synthetic spectrum
compared to the spline approximation of the measured spectrum (red line). The other graphs show time-domain
synthetic data (blue) in comparison with measured data (orange).
}
\label{simulated_noise_spectrum}
\end{figure}
In our simulations, we manipulated four main variables of our modulation scheme and demodulation algorithm and observed
their impact on symbol error rate (SER):
\begin{description}
\item[Modulation amplitude.] Higher amplitude should correspond to a lower SER.
\item[Modulation bit count.] Higher bit count $n$ means longer transmissions but yields higher theoretical decoding
gain, and should increase demodulator sensitivity. Ultimately, we want to find a sweet spot of manageable
transmission length at good demodulator sensitivity.
\item[Decimation] or DSSS chip duration. The chip time determines where in the grid frequency spectrum (fig.\
\ref{freq_meas_spectrum}) our modulated signal is located. Given our noise spectrum (fig.\
\ref{freq_meas_spectrum}) lower chip durations (shifting our signal upwards in the spectrum) should yield lower
in-band background noise which should correspond to lower symbol error rates.
\item[Demodulation correlator peak threshold factor.] The first step of our prototype demodulation algorithm is to
calculate the correlation between the input data and all $2^n+1$ Gold sequences
% FIXME add a \ref here, describe proto demod alg somewhere
and to identify peaks corresponding to the input data containing a correctly aligned Gold sequence. The
threshold factor determines how large a peak must be compared to baseline noise levels to be considered in the
following maximum likelihood estimation (MLE) decoding. % FIXME do we actually do MLS?
\end{description}
As indicated by our results, symbol error rate is a good proxy of demodulation performance. With decreasing
signal-to-noise ratio, margins in various parts of the demodulator decrease which statistically leads to an increased
symbol error rate. Our simulations yield smooth, reproducible SER curves with adequately low error bounds. This
indicates SER is related fairly monotonically to the signal-to-noise margins inside our demodulator prototype.
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/dsss_gold_nbits_overview}
\caption{
Symbol Error Rate (SER) as a function of transmission amplitude. The line indicates the mean of several
measurements for each parameter set. The shaded areas indicate one standard deviation from the mean. Background
noise for each trial is a random segment of measured grid frequency. Background noise amplitude is the same for
all trials. Shown are four traces for four different DSSS sequence lengths. Using a 5-bit Gold code, one DSSS
symbol measures 31 chips. 6 bit per symbol are 63 chips, 7 bit are 127 chips and 8 bit 255 chips. This
simulation uses a decimation of 10, which corresponds to a $1 \text{s}$ chip length at our $10 \text{Hz}$ grid
frequency sampling rate. At 5 bit per symbol, one symbol takes $31 \text{s}$ and one bit takes $6.2 \text{s}$
amortized. At 8 bit one symbol takes $255 \text{s} = 4 \text{min} 15 \text{s}$ and one bit takes $31.9 \text{s}$
amortized. Here, slower transmission speed buys coding gain. All else being the same this allows for a decrease
in transmission power.
}
\label{dsss_gold_nbits_overview}
\end{figure}
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/dsss_gold_nbits_sensitivity}
\caption{
Amplitude at a SER of 0.5\ in mHz depending on symbol length. Here we can observe an increase of sensitivity
with increasing symbol length, but we can clearly see diminishing returns above 6 bit (63 chips). Considering
that each bit roughly doubles overall transmission time for a given data length it seems lower bit counts are
preferable if the necessary transmitter power can be realized.
}
\label{dsss_gold_nbits_sensitivity}
\end{figure}
\begin{figure}
\centering
\includegraphics{../lab-windows/fig_out/dsss_thf_amplitude_5678}
\caption{
SER vs.\ amplitude graph similar to fig.\ \ref{dsss_gold_nbits_overview} with dependence on threshold factor
color-coded. Each graph shows traces for a single DSSS symbol length.
}
\label{dsss_thf_amplitude_5678}
\end{figure}
\begin{figure}
\ContinuedFloat
\begin{subfigure}{\textwidth}
\centering
\includegraphics{../lab-windows/fig_out/dsss_thf_sensitivity_5678}
\caption{
\footnotesize Graphs of amplitude at $SER=0.5$ for each symbol length as well as asymptotic SER for large
amplitudes. Areas shaded red indicate that $SER=0.5$ was not reached for any amplitude in the simulated
range. We can observe that smaller symbol lengths favor lower threshold factors, and that optimal threshold
factors for all symbol lengths are between $4.0$ and $5.0$.
}
\label{dsss_thf_sensitivity_5678}
\end{subfigure}
\caption{
Dependence of demodulator sensitivity on the threshold factor used for correlation peak detection in our
DSSS demodulator. This is an empirically-determined parameter specific to our demodulation algorithm. At low
threshold factors our classifier yields lots of spurious peaks that have to be thrown out by our maximum
likelihood estimator. These spurious peaks have a random time distribution and thus do not pose much of a
challenge to our MLE but at very low threshold factors the number of spurious peaks slows down decoding and
does still clog our MLE's internal size-limited candidate lists which leads to failed decodings. At very
high threshold factors decoding performance suffers greatly since many valid correlation peaks get
incorrectly ignored. The glitches at medium threshold factors in the 7- and 8-bit graphs are artifacts of
our prototype decoding algorithm that we have not fixed in the prototype implementation since we wanted to
focus on the final C version.}
\label{dsss_thf_sensitivity}
\end{figure}
\begin{figure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_5}
\caption{
5 bit Gold code
}
\label{chip_duration_sensitivity_5}
\end{subfigure}
\end{figure}
\begin{figure}
\ContinuedFloat
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_6}
\caption{
6 bit Gold code
}
\label{chip_duration_sensitivity_6}
\end{subfigure}
\caption{
Dependence of demodulator sensitivity on DSSS chip duration. Due to computational constraints this simulation is
limited to 5 bit and 6 bit DSSS sequences. There is a clearly visible sensitivity maximum at fairly short chip
lengths around $0.2 \text{s}$. Short chip durations shift the entire transmission band up in frequency. In fig.\
\ref{freq_meas_spectrum} we can see that noise energy is mostly concentrated at lower frequencies, so shifting
our signal up in frequency will reduce the amount of noise the decoder sees behind the correlator by shifting
the band of interest into a lower-noise spectral region. For a practical implementation chip duration is limited
by physical factors such as the maximum modulation slew rate ($\frac{\text{d}P}{\text{d}t}$), the maximum
Rate-Of-Change-Of-Frequency (ROCOF, $\frac{\text{d}f}{\text{d}t}$) the grid can tolerate and possible inertial
effects limiting response of frequency to load changes at certain load levels.
% FIXME are these inertial effects likely? Ask an expert.
}
\label{chip_duration_sensitivity}
\end{figure}
\begin{figure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_cmp_meas_6}
\caption{
Simulation using baseline frequency data from actual measurements.
}
\label{chip_duration_sensitivity_cmp_meas_6}
\end{subfigure}
\end{figure}
\begin{figure}
\ContinuedFloat
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=\textwidth]{../lab-windows/fig_out/chip_duration_sensitivity_cmp_synth_6}
\caption{
Simulation using synthetic frequency data.
}
\label{chip_duration_sensitivity_cmp_synth_6}
\end{subfigure}
\caption{
Chip duration/sensitivity simulation results like in fig.\ \ref{chip_duration_sensitivity} compared between a
simulation using measured frequency data like previous graphs and one using artificially generated noise. There
is almost no visible difference indicating that we have found a good model of reality in our noise synthesizer,
but also that real grid frequency behaves like a frequency-shaped Gaussian noise process.
}
\label{chip_duration_sensitivity_cmp}
\end{figure}
\section{Implementation of a demonstrator unit}
%FIXME
To demonstrate the viability of our reset architecture we decided to implement a demonstrator system. In this
demonstrator we use JTAG to reset part of a commodity smart meter from an externally-connected reset controller. The
reset controller receives its commands over the grid frequency modulation system we outlined in this thesis. To keep
implementation cost low the reset controller is fed a simulation of a modulated grid frequency signal through a standard
\SI{3.5}{\milli\meter} audio jack\footnote{
By generously cutting two PCB traces the meter we chose to use can be easily modified to provide strong galvanic
separation between grid and main application microcontroller. With this modification we have to supply power to its
main application MCU externally along with the JTAG interface.
}.
\subsection{Selecting a smart meter for demonstration purposes}
For our demonstrator to make sense we wanted to select a realistic reset target. In Germany where this thesis was
written a standards-compliant setup would consist of a fairly dumb smart meter and a smart meter gateway (SMGW)
containing all of the complex bidirectional protocol logic such as wireless or landline IP connectivity. The realistic
target for a setup in this architecture would be the components of an SMGW such as its communications modem or main
application processor. In the German architecture the smart meter does not even have to have a bi-directional data link
to the SMGW effectively mitigating any attack vector for remote compromise.
Despite these considerations we still chose to reset the application MCU inside the smart meter for two reasons. One is that
SMGWs are much harder to come by on the second-hand market. The other is that SMGWs are a particular feature of the
German standardization landscape and in many other countries the functions of an SMGW are integrated into the meter
itself. % FIXME citation
In the end we settled on a Q3DA1002 three-phase 60A meter made by German manufacturer EasyMeter. This meter is typical
of what would be found in an average German household and can be acquired very inexpensively as new old stock on online
marketplaces.
The meter consists of a plastic enclosure with a transparent polycarbonate top part and a grey ABS bottom part that are
ultrasonically welded shut. In the bottom part of the case a PCB we call the \emph{measurement} board is potted in
epoxide resin (see fig.\ \ref{easymeter_composites}). This PCB contains three separate energy measurement ASICs for the
three phases (see fig.\ \ref{easymeter_detail_xrays}). It also contains a capacitive dropper power supply for the meter
circuitry and external modules such as an SMGW. The measurement board communicates through three infrared links (one per
phase) with a smaller unpotted PCB we call the \emph{display} board in the top of the case. This PCB handles
measurement logging and aggregation, controls a small segment LCD displaying totals and handles the externally
accessible \si{\kilo\watt\hour} impulse LED and serial IR links.
The measurement board does not contain any logging or outside communication interfaces. All of that is handled on the
display board by a Texas Instruments MSP430F2350 application MCU. This is a 16-bit RISC MCU with \SI{16}{\kilo\byte}
flash and \SI{2}{\kilo\byte} SRAM\footnote{
The microcontroller might seem a bit overkill for such a simple application, but most of its \SI{16}{\kilo\byte}
program flash is in fact used. A casual glance with Ghidra shows that a large part of program flash is expended on
keeping multiple redundant copies of energy consumption aggregates including error recovery in case of data
corruption and some effort has even been made to guard against data corruption using simple non-cryptographic
checksums. Another large part of the MCU's firmware handles data transmission over the meter's externally accessible
IR link through Smart Message Language\cite{bsi-tr-03109-1-IVb}.
}. There is an I2C EEPROM that is used in conjunction with the microcontroller's internal \SI{256}{\byte} data flash to
keep redundant copies of energy consumption aggregates. On the side of the base board is a 14-pin header containing both
a standard TI MSP430 JTAG pinout and a UART serial link for debugging. Conveniently the JTAG port was left enabled by
fuse in our particular production unit.
We chose to use this MSP430 series application MCU as our reset target. Though in this particular unit compromise is
impossible due to a lack of bi-directional communication links some of its sister models do contain bidirectional
communication links\cite{easymeter01} making compromise through communication interfaces at least a theoretical
possibility. In other countries meters with a similar architecture to the Q3DA1002 commonly include complex protocol
logic as part of the meter itself\cite{honeywell01,ifixit01}. As an example, the Honeywell REX2 uses a Maxim Integrated
71M6541 main application microcontroller along with a Texas Instruments CC1000 series radio transceiver and is
advertised to support both over-the-air firmware upgrades and a remotely accessible ``service control switch''.
\begin{figure}
\centering
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=0.6\textwidth]{resources/easymeter_board_composite.jpg}
\caption{
\footnotesize
Optical composite image of the display and data logging board in the top of the case. The six pins at the
top are the SPI chip-on-glass segment LCD. Of the eight pads on the left six are unused and two carry the
auxiliary power supply from the measurement board below. The bottom right section contains the
\si{\kilo\watt\hour} impulse LED and the angled IR communication LED. The flying wires
connect to the 14-pin JTAG and serial debug header.
}
\label{easymeter_display_board_composite}
\end{subfigure}
\begin{subfigure}{\textwidth}
\vspace{1cm}
\centering
\includegraphics[width=0.8\textwidth]{resources/easymeter_baseboard_composite.jpg}
\caption{
\footnotesize
Composite microfocus x-ray image of the potted measurement module in the bottom of the case. The ovals on
the top left and right are power supply and data jumper connections for external modules such as SMGW
interfaces. The bright parts at the bottom are the massive screw terminals with integrated current shunts.
The circuitry right of the three independent measurement channels is the power supply circuit for the
display board.
}
\label{easymeter_measurement_board_composite}
\end{subfigure}
\caption{
Composite images of the circuit boards inside the EasyMeter Q3DA1002 ``smart'' electricity meter used in our
demonstration.
}
\label{easymeter_composites}
\end{figure}
\begin{figure}
\centering
\begin{subfigure}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{resources/easymeter_baseboard_channel.jpg}
\caption{Microfocus x-ray of one channel's data acquisition circuit}
\label{easymeter_channel_xray}
\end{subfigure}\hspace*{5mm}
\begin{subfigure}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{resources/easymeter_baseboard_powersupply.jpg}
\caption{Microfocus x-ray of the auxiliary power supply}
\label{easymeter_powersupply_xray}
\end{subfigure}
\caption{
Microfocus x-rays of major sections of the EasyMeter Q3DA1002 measurement board
}
\label{easymeter_detail_xrays}
\end{figure}
\subsection{Firmware implementation}
We based our safety reset demonstrator firmware on the grid frequency sensor firmware we developed in sec.\
\ref{sec-fsensor}. We implemented DSSS demodulation by translating the python prototype code we developed in sec.\
\ref{sec-ch-sim} to embedded C code. After validating the C translation in extensive simulations we integrated our code
with a Reed-Solomon implementation and a libsodium-based implementation of the cryptographic protocol we designed in
sec.\ \ref{sec-crypto}. % FIXME WIP
To reprogram the target MSP430 microcontroller we ported over the low-level bitbang JTAG driver of
mspdebug\footnote{\url{https://github.com/dlbeer/mspdebug}}.
For all computation-heavy high-level modules of our firmware such as the DSSS demodulator or the grid frequency
estimator we wrote test fixtures that allow the same code that runs on the microcontroller to be executed on the host
for testing. These test fixtures are very simple C programs that load input data from a file or the command line, run
the algorithm and print results on standard output.
\section{Grid frequency modulation emulation}
To emulate a modulated grid frequency signal we superimposed a DSSS-modulated signal at the proper amplitude with
synthetic grid frequency noise generated according to the measurements we took in sec. \ref{sec-fsensor}. In this
primitive simulation we do not simulate the precise impulse response of the grid to a DSSS-modulated stimulus signal.
Our results still serve to illustrate the possibility of data transmission in this manner since this impulse response can be
compensated for at the transmitter by selecting appropriate modulation parameters (e.g. chip rate and amplitude) and at
the receiver by equalization with a matched filter.
\section{Experimental results}
% FIXME
\section{Lessons learned}
Before settling on the commercial smart meter we first tried to use an EVM430-F6779 smart meter evaluation kit made by
Texas Instruments. This evaluation kit did not turn out well for two main reasons. One, it shipped with half the case
missing and no cover for the terminal blocks. Because of this some work was required to maintain electrical safety.
Even after mounting it in an electrically safe manner, since the main MCU is not isolated from the grid and the JTAG port
is also galvanically coupled, the safety reset controller prototype would also have to be galvanically isolated to not
pose an electrical safety risk. The second issue we ran into was that the EVM430-F6779 is based around an MSP430F6779
microcontroller. This microcontroller is a rather large part within the MSP430 series and uses a particularly new
revision of the CPU core and associated JTAG peripheral that are incompatible with all MSP430 programmers we tried to
use on it. mspdebug does not have support for it and porting TI's own JTAG programmer reference sources did not yield
any results either. Finally we tried a USB-based programmer made by TI themselves that turned out to either have broken
firmware or a hardware defect, leading to it frequently re-enumerating on the USB.
Overall our initial assumption that a development kit would certainly be easier to program than a commercial meter did
not prove to be true. Contrary to our expectations the commercial meter had JTAG enabled allowing us to easily read out
its stock firmware without needing to reverse-engineer vendor firmware update files or circumvent code protection
measures. The fact that its firmware was only available in its compiled binary form was not much of a hindrance as it
proved not to be too complex and all we wanted to know could be found out with just a few hours of digging in Ghidra.
In the firmware development phase our approach of testing every module individually (e.g. DSSS demodulator, Reed-Solomon
decoder, grid frequency estimation) proved to be very useful. In particular debugging benefited greatly from being able
to run a couple thousand tests within seconds. In case of our DSSS demodulator this modular testing and simulation
architecture allowed us to simulate many thousand runs of our implementation on test data and directly compare it to our
Jupyter/Python prototype (see fig.\ \ref{fw_proto_comparison}). Since we spent more time polishing our embedded C
implementation it turned out to perform much better than our initial python prototype. At the same time it shows
fundamentally similar response to its parameters. One significant bug we fixed in the embedded C version is the python
version's tendency towards incorrect decodings at even very large amplitudes.
\begin{figure}
\centering
\begin{subfigure}{\textwidth}
\centering
\includegraphics[trim={0 4cm 0 0},clip]{../lab-windows/fig_out/dsss_thf_amplitude_56_jupyter_impl}
\caption{Python prototype}
\end{subfigure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics[trim={0 4cm 0 0},clip]{../lab-windows/fig_out/dsss_thf_amplitude_56_fw_impl}
\caption{Embedded C implementation}
\end{subfigure}
\caption{
Symbol error rate plots versus threshold factor for both our Python prototype (above) and our firmware
implementation of our demodulation algorithm (below). Note the slightly different threshold factor color scales. Cf.\
fig.\ \ref{dsss_thf_amplitude_5678}.
}
\label{fw_proto_comparison}
\end{figure}
In accordance with our initial estimations we did not run into any code space or computation bottlenecks by choosing
floating-point emulation instead of porting over our algorithms to fixed-point calculations. The extremely slow sampling
rate of our systems makes even heavyweight processing such as FFT or our rather brute-force dynamic programming approach
to DSSS demodulation possible well within performance constraints.
Compiled code size of our firmware implementation is slightly larger than we would like at around \SI{64}{\kilo\byte}
for our firmware image including everything except the target microcontroller firmware image. See appendix
\ref{symbol_size_chart} for a graph illustrating the contribution of various parts of the signal processing toolchain to
this total. Overall the most heavy-weight operations by far are the SHA512 implementation from libsodium and the FFT
from ARM's CMSIS signal processing library.
\chapter{Future work}
\section{Technical standardization}
The description of a safety reset system provided in this work could be translated into a formalized technical standard
with relatively low effort. Our system is very simple compared to e.g.\ a full smart meter communication standard and
thus can conceivably be described in a single, concise document. The much more complicated side of standardization would
be the standardization of the backend operation including key management, coordination and command authorization.
\section{Regulatory adoption}
Since the proposed system adds significant cost and development overhead at no immediate benefit to either consumer or
utility company it is unlikely that it would be adopted voluntarily. Market forces limit what long-term planning utility
companies can do. An advanced mitigation such as this one might be out of their reach on their own and might require
regulatory intervention to be implemented. To regulatory authorities a system such as this one provides a powerful
primitive to guard against attacks. Due to the low-level approach our system might allow a regulatory authority to
restore meters to a safe state without the need of fine-grained control of implementation details such as application
network protocols.
A regulatory authority might specify that all smart meters must use a standardized reset controller that on command
resets to a minimal firmware image that disables external communication, continues basic billing functions and enables
any disconnect switches. This system would enable the \emph{reset authority} to directly preempt a large-scale attack
irrespective of implementation details of the various smart meter implementations.
Cryptographic key management for the safety reset system is not much different from the management of highly privileged
signing keys as they are used in many other systems already. If the safety reset system is implemented with a
regulatory authority as the \emph{reset authority} they would likely be able to find a public entity that is already
managing root keys for other government systems to also manage safety reset keys. Availability and security requirements
of safety reset keys do not differ significantly from those for other types of root keys.
\section{Practical implementation}
%FIXME
\section{Zones of trust}
In our design, we opted for a safety reset controller
% FIXME is "safety reset" the proper name here? We need some sort of branding, but is this here really about "safety"?
in the form of a dedicated microcontroller entirely separate from whatever application microcontroller the smart meter design
is already using. This design nicely separates the meter into an untrusted application (the core microcontroller) and
the trusted reset controller. Since the interface between the two is simple and logically one-way, it can be validated
to a high standard of security.
Despite these security benefits, the cost of such a separate hardware device might prove high in a mass-market rollout.
In this case, one might attempt to integrate the reset controller into the core microcontroller in some way. Primarily,
there would be two ways to accomplish this. One is a solution that physically integrates an additional microcontroller
core into the main application microcontroller package either as a submodule on the same die or as a separate die in a
multi-chip module (MCM) with the main application microcontroller. A full-custom solution integrating both on a single
die might be a viable path for very large-scale deployments, but will most likely be too expensive in tooling costs
alone to justify its use. More likely for a medium- to large-scale deployment (millions of meters) would be an MCM
integrating an off-the-shelf smart metering microcontroller die with the reset controller running on another, much
smaller off-the-shelf microcontroller die. This solution might potentially save some cost compared to a solution using a
discrete microcontroller for the reset controller.
The more likely approach to reducing cost overhead of the reset controller would be to employ virtualization
technologies such as ARM's TrustZone in order to incorporate the reset controller firmware into the application firmware
on the same chip without compromising the reset controller's security or disturbing the application firmware's
operation.
TrustZone is a virtualization technology that provides a hardware-assisted privileged execution domain on at least one
of the microcontroller's cores. In traditional virtualization setups a privileged hypervisor is managing several
unprivileged applications sharing resources between them. Separation between applications in this setup is longitudinal
between adjacent virtual machines. Two applications would both be running in unprivileged mode sharing the same CPU and
the hypervisor would merely schedule them, configure hardware resource access and coordinate communication. This
longitudinal virtualization simplifies application development since from the application's perspective the virtual
machine looks very similar to a physical one. In addition, in general this setup reciprocally isolates two applications
with neither one being able to gain control over the other.
In contrast to this, a TrustZone-like system in general does not provide several application virtual machines and
longitudinal separation. Instead, it provides lateral separation between two domains: The unprivileged application
firmware and a privileged hypervisor. Application firmware may communicate with the hypervisor through defined
interfaces but due to TrustZone's design it need not even be aware of the hypervisor's existence. This makes a perfect
fit for our reset controller. The reset controller firmware would be running in privileged mode and without exposing any
communication interfaces to application firmware. The application firmware would be running in unprivileged mode
without any modification. The main hurdles to the implementation of a system like this are the requirement for a
microcontroller providing this type of virtualization on the one hand and the complexity of correctly employing this
virtualization on the other hand. Virtualization systems such as TrustZone are still orders of magnitude more complex to
configure correctly than simply using separate hardware and securing the interfaces in between.
\chapter{Alternative use of grid frequency modulation}
% FIXME random beacons? funky consensus protocols? proof of knowledge/cryptographic notary service?
\chapter{Conclusion}
%FIXME
\newpage
\appendix
\chapter{Acknowledgements}
%FIXME
\newpage
\chapter{References}
\nocite{*} % FIXME
\printbibliography
\newpage
\chapter{Transcripts of Jupyter notebooks used in this thesis}
%\includenotebook{Grid frequency estimation}{grid_freq_estimation}
%\includenotebook{Grid frequency estimation validation against ROCOF test suite}{freq_meas_validation_rocof_testsuite}
%\includenotebook{Frequency sensor clock stability analysis}{gps_clock_jitter_analysis}
%\includenotebook{DSSS modulation experiments}{dsss_experiments-ber}
\chapter{Demonstrator Resources}
\section{Schematics and code}
% FIXME
\chapter{Demonstrator Firmware Symbol Sizes}
\label{symbol_size_chart}
\includepdf[fitpaper]{resources/safetyreset-symbol-sizes.pdf}
\chapter{Economic viability of countermeasures}
\section{Attack cost}
\section{Countermeasure cost}
% FIXME maybe include a standard for the technical side of a safety reset system here, e.g. in the style of an IETF draft?
\end{document}
|