txn.cc
#include "macros.h"
#include "txn.h"
#include "dbcore/rcu.h"
#include "dbcore/sm-rep.h"
#include "dbcore/serial.h"
#include "ermia.h"
namespace ermia {
transaction::transaction(uint64_t flags, str_arena &sa)
: flags(flags), sa(&sa) {
if (!(flags & TXN_FLAG_CMD_REDO) && config::is_backup_srv()) {
// Read-only transaction on backup - grab a begin timestamp and go.
// A read-only 'transaction' on a backup is basically reading a
// consistent snapshot back in time. No CC involved.
static thread_local TXN::xid_context *ctx = nullptr;
if (!ctx) {
ctx = TXN::xid_get_context(TXN::xid_alloc());
}
xc = ctx;
RCU::rcu_enter();
xc->begin = rep::GetReadView();
ASSERT(xc->begin);
xc->xct = this;
} else {
initialize_read_write();
}
}
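// Set up CC state for a normal read-write transaction: allocate an XID and
// its xid_context, enter the current epoch, and (depending on the CC scheme
// and flags) register with the serial subsystem, create a per-tx log, and
// take a begin timestamp (cur_lsn + 1, or the safesnap LSN for eligible
// read-only transactions).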
void transaction::initialize_read_write() {
if (config::phantom_prot) {
masstree_absent_set.set_empty_key(NULL); // google dense map
masstree_absent_set.clear();
}
GetWriteSet().clear();
#if defined(SSN) || defined(SSI) || defined(MVOCC)
GetReadSet().clear();
#endif
xid = TXN::xid_alloc();
xc = TXN::xid_get_context(xid);
xc->begin_epoch = MM::epoch_enter();
xc->xct = this;
#if defined(SSN) || defined(SSI)
// If there's a safesnap, then SSN treats the snapshot as a transaction
// that has read all the versions, which means every update transaction
// should have an initial pstamp of the safesnap.
//
// Readers under SSI using safesnap are free from SSI checks, but writers
// will have to see whether they have a ct3 that's before the safesnap lsn
// (i.e., the safesnap is T1, the updater is T2). So SSI updaters also need
// to take a look at the safesnap lsn.
// Take a safe snapshot if read-only.
if (config::enable_safesnap and (flags & TXN_FLAG_READ_ONLY)) {
ASSERT(MM::safesnap_lsn);
xc->begin = volatile_read(MM::safesnap_lsn);
log = NULL;
} else {
TXN::serial_register_tx(xid);
RCU::rcu_enter();
log = logmgr->new_tx_log();
// Must +1: a tx T can only update a tuple if its latest version was
// created before T's begin timestamp (i.e., version.clsn < T.begin,
// note the range is exclusive; see first updater wins rule in
// oid_put_update() in sm-oid.cpp). Otherwise we risk making no
// progress when retrying an aborted transaction: everyone is trying
// to update the same tuple with latest version stamped at cur_lsn()
// but no one can succeed (because version.clsn == cur_lsn == t.begin).
xc->begin = logmgr->cur_lsn().offset() + 1;
#ifdef SSN
xc->pstamp = volatile_read(MM::safesnap_lsn);
#elif defined(SSI)
xc->last_safesnap = volatile_read(MM::safesnap_lsn);
#endif
}
#elif defined(MVOCC)
RCU::rcu_enter();
log = logmgr->new_tx_log();
xc->begin = logmgr->cur_lsn().offset() + 1;
#else
// SI - see if it's read only. If so, skip logging etc.
RCU::rcu_enter();
log = (flags & TXN_FLAG_READ_ONLY) ? nullptr : logmgr->new_tx_log();
xc->begin = logmgr->cur_lsn().offset() + 1;
#endif
}
transaction::~transaction() {
if (config::is_backup_srv() && !(flags & TXN_FLAG_CMD_REDO)) {
RCU::rcu_exit();
return;
}
// A transaction shouldn't fall out of scope without resolution;
// resolution means TXN_CMMTD or TXN_ABRTD.
ASSERT(state() != TXN::TXN_ACTIVE && state() != TXN::TXN_COMMITTING);
#if defined(SSN) || defined(SSI)
if (not config::enable_safesnap or (not(flags & TXN_FLAG_READ_ONLY)))
RCU::rcu_exit();
#else
RCU::rcu_exit();
#endif
#if defined(SSN) || defined(SSI)
if (not config::enable_safesnap or (not(flags & TXN_FLAG_READ_ONLY)))
TXN::serial_deregister_tx(xid);
#endif
if (config::enable_safesnap and flags & TXN_FLAG_READ_ONLY)
MM::epoch_exit(0, xc->begin_epoch);
else
MM::epoch_exit(xc->end, xc->begin_epoch);
TXN::xid_free(xid); // must do this after epoch_exit, which uses xc.end
}
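// Abort path: mark the context TXN_ABRTD so concurrent readers skip our dirty
// versions, deregister from tuple reader bitmaps (SSN/SSI), unlink and
// deallocate every version we installed, and discard the per-tx log (if any).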
void transaction::Abort() {
// Mark the dirty tuple as invalid, for oid_get_version to
// move on more quickly.
volatile_write(xc->state, TXN::TXN_ABRTD);
#if defined(SSN) || defined(SSI)
// Go over the read set first, to deregister from the tuple
// asap so the updater won't wait for too long.
auto &read_set = GetReadSet();
for (uint32_t i = 0; i < read_set.size(); ++i) {
auto &r = read_set[i];
ASSERT(r->GetObject()->GetClsn().asi_type() == fat_ptr::ASI_LOG);
// remove myself from reader list
serial_deregister_reader_tx(&r->readers_bitmap);
}
#endif
auto &write_set = GetWriteSet();
for (uint32_t i = 0; i < write_set.size(); ++i) {
auto &w = write_set[i];
dbtuple *tuple = (dbtuple *)w.get_object()->GetPayload();
ASSERT(tuple);
ASSERT(XID::from_ptr(tuple->GetObject()->GetClsn()) == xid);
#if defined(SSI) || defined(SSN) || defined(MVOCC)
if (tuple->NextVolatile()) {
volatile_write(tuple->NextVolatile()->sstamp, NULL_PTR);
#ifdef SSN
tuple->NextVolatile()->welcome_read_mostly_tx();
#endif
}
#endif
Object *obj = w.get_object();
fat_ptr entry = *w.entry;
oidmgr->PrimaryTupleUnlink(w.entry);
obj->SetClsn(NULL_PTR);
ASSERT(obj->GetAllocateEpoch() == xc->begin_epoch);
MM::deallocate(entry);
}
// Read-only tx on a safesnap won't have log
if (log) {
log->discard();
}
}
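// Pre-commit entry point: transition to TXN_COMMITTING, obtain a commit
// timestamp from the log (except for safesnap read-only transactions), then
// dispatch to the CC-specific commit protocol (SSN/SSI/MVOCC/SI).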
rc_t transaction::commit() {
ALWAYS_ASSERT(state() == TXN::TXN_ACTIVE);
volatile_write(xc->state, TXN::TXN_COMMITTING);
#if defined(SSN) || defined(SSI)
// Safe snapshot optimization for read-only transactions:
// Use the begin ts as cstamp if it's a read-only transaction
// This is the same for both SSN and SSI.
auto &write_set = GetWriteSet();
if (config::enable_safesnap and (flags & TXN_FLAG_READ_ONLY)) {
ASSERT(not log);
ASSERT(write_set.size() == 0);
xc->end = xc->begin;
volatile_write(xc->state, TXN::TXN_CMMTD);
return {RC_TRUE};
} else {
ASSERT(log);
xc->end = log->pre_commit().offset();
if (xc->end == 0) {
return rc_t{RC_ABORT_INTERNAL};
}
#ifdef SSN
return parallel_ssn_commit();
#elif defined SSI
return parallel_ssi_commit();
#endif
}
#elif defined(MVOCC)
return mvocc_commit();
#else
return si_commit();
#endif
}
#if defined(SSN) || defined(SSI)
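// Monotonically raise tuple->xstamp to s with a CAS loop; the loop exits once
// the stored value is already >= s or the CAS succeeds.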
#define set_tuple_xstamp(tuple, s) \
{ \
uint64_t x; \
do { \
x = volatile_read(tuple->xstamp); \
} while (x < s and \
not __sync_bool_compare_and_swap(&tuple->xstamp, x, s)); \
}
#endif
#ifdef SSN
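// SSN parallel pre-commit: derive pi (sstamp) from overwriters of versions we
// read and eta (pstamp) from readers of versions we overwrite, aborting on any
// exclusion-window violation; on success, make the new versions visible and
// stamp the versions we read.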
rc_t transaction::parallel_ssn_commit() {
auto cstamp = xc->end;
// Note that sstamp normally comes from reads, but the read optimization
// might skip looking at a tuple's sstamp entirely; so if the tx's sstamp
// is still the initial value at this point, initialize it to cstamp (so
// that later we can stamp the overwritten tuples' sstamp with cstamp in
// case it still has the initial value). Consider the extreme case where
// old_version_threshold = 0: there is no read set at all...
if (is_read_mostly() && config::ssn_read_opt_enabled()) {
if (xc->sstamp.load(std::memory_order_acquire) == 0)
xc->sstamp.store(cstamp, std::memory_order_release);
} else {
if (xc->sstamp.load(std::memory_order_relaxed) == 0)
xc->sstamp.store(cstamp, std::memory_order_relaxed);
}
// Find out my largest predecessor (\eta) and smallest successor (\pi):
// for reads, see if somebody has written the tuples - look at the successor lsn;
// for writes, see if somebody has read the tuples - look at the access lsn.
// Process reads first for a stable sstamp to be used for the
// Process reads first for a stable sstamp to be used for the
// read-optimization
auto &read_set = GetReadSet();
for (uint32_t i = 0; i < read_set.size(); ++i) {
auto &r = read_set[i];
try_get_successor:
ASSERT(r->GetObject()->GetClsn().asi_type() == fat_ptr::ASI_LOG);
// read the tuple's sstamp into a local variable before doing anything that
// relies on it; it might change at any time...
fat_ptr successor_clsn = volatile_read(r->sstamp);
if (successor_clsn == NULL_PTR) continue;
if (successor_clsn.asi_type() == fat_ptr::ASI_LOG) {
// overwriter already fully committed/aborted or no overwriter at all
xc->set_sstamp(successor_clsn.offset());
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
} else {
// overwriter in progress
ALWAYS_ASSERT(successor_clsn.asi_type() == fat_ptr::ASI_XID);
XID successor_xid = XID::from_ptr(successor_clsn);
TXN::xid_context *successor_xc = TXN::xid_get_context(successor_xid);
if (not successor_xc) {
goto try_get_successor;
}
if (volatile_read(successor_xc->owner) == xc->owner) // myself
continue;
// Must obtain the successor's status first then check ownership
auto successor_state = volatile_read(successor_xc->state);
if (not successor_xc->verify_owner(successor_xid)) {
goto try_get_successor;
}
// Note the race between reading the successor's cstamp and the successor
// setting its cstamp after obtaining one from the log: the successor could
// have obtained a cstamp but not yet stored it in its cstamp field, so we
// must rely on the successor's state (set before obtaining the cstamp) and
// then spin on the successor's cstamp if necessary (it has entered
// pre-commit but hasn't published its cstamp yet). Directly reading the
// successor's cstamp might miss successors that have already obtained a
// cstamp but haven't stored it in successor_xc->end (especially dangerous
// if its cstamp is smaller than mine - we could miss the successor).
if (successor_state == TXN::TXN_ACTIVE) {
// Not yet in pre-commit, skip
continue;
}
// Already in pre-commit or committed, definitely has (or will have)
// cstamp
uint64_t successor_end = 0;
bool should_continue = false;
while (not successor_end) {
// Must proceed in this order: 1. read cstamp, 2. read state, 3. verify owner
auto s = volatile_read(successor_xc->end);
successor_state = volatile_read(successor_xc->state);
if (not successor_xc->verify_owner(successor_xid)) {
goto try_get_successor;
}
if (successor_state == TXN::TXN_ABRTD) {
// If there's a new overwriter, it must have a cstamp larger than mine
should_continue = true;
break;
}
ALWAYS_ASSERT(successor_state == TXN::TXN_CMMTD or
successor_state == TXN::TXN_COMMITTING);
successor_end = s;
}
if (should_continue) {
continue;
}
// The overwriter might not have committed yet, or may commit after me or
// before me; we only care if the successor committed *before* me.
ALWAYS_ASSERT(successor_end);
ALWAYS_ASSERT(successor_end != cstamp);
if (successor_end > cstamp) {
continue;
}
if (successor_state == TXN::TXN_COMMITTING) {
// When we got successor_end, the successor was committing, use
// successor_end
// if it indeed committed
successor_state = TXN::spin_for_cstamp(successor_xid, successor_xc);
}
// Context change, previous overwriter was gone, retry (should see ASI_LOG
// this time).
if (successor_state == TXN::TXN_INVALID)
goto try_get_successor;
else if (successor_state == TXN::TXN_CMMTD) {
// Again, successor_xc->sstamp might change any time (i.e., successor_xc
// might get reused because successor concludes), so must
// read-then-verify.
auto s = successor_xc->sstamp.load(
(config::ssn_read_opt_enabled() && is_read_mostly())
? std::memory_order_acquire
: std::memory_order_relaxed);
if (not successor_xc->verify_owner(successor_xid)) {
goto try_get_successor;
}
xc->set_sstamp(s);
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
}
}
}
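// Now examine every version we overwrite: lock out read-mostly readers, then
// walk its readers bitmap to fold committed readers' cstamps into eta (pstamp).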
auto &write_set = GetWriteSet();
for (uint32_t i = 0; i < write_set.size(); ++i) {
auto &w = write_set[i];
dbtuple *tuple = (dbtuple *)w.get_object()->GetPayload();
// go to the precommitted or committed version I (am about to)
// overwrite for the reader list
dbtuple *overwritten_tuple = tuple->NextVolatile();
ASSERT(not overwritten_tuple or
(tuple->GetObject())->GetNextVolatile().offset() ==
(uint64_t)(overwritten_tuple->GetObject()));
if (not overwritten_tuple) // insert
continue;
ASSERT(XID::from_ptr(volatile_read(overwritten_tuple->sstamp)) == xid);
// Do this before examining the preader field and reading the readers bitmap
overwritten_tuple->lockout_read_mostly_tx();
// Now readers who think this is an old version won't be able to read it.
// Then read the readers bitmap - it's guaranteed to cover all possible
// readers (those who think it's an old version) because we call
// lockout_read_mostly_tx() first. Readers who think this is a young
// version can still come at any time - they will be handled by the
// original SSN machinery.
TXN::readers_bitmap_iterator readers_iter(&overwritten_tuple->readers_bitmap);
while (true) {
int32_t xid_idx = readers_iter.next(true);
if (xid_idx == -1) break;
XID rxid = volatile_read(TXN::rlist.xids[xid_idx]);
ASSERT(rxid != xc->owner);
TXN::xid_context *reader_xc = NULL;
uint64_t reader_end = 0;
auto reader_state = TXN::TXN_ACTIVE;
if (rxid != INVALID_XID) {
reader_xc = TXN::xid_get_context(rxid);
if (reader_xc) {
// Copy everything before doing anything:
// reader_end for getting pstamp;
reader_state = volatile_read(reader_xc->state);
reader_end = volatile_read(reader_xc->end);
}
}
if (rxid == INVALID_XID or not reader_xc or
not reader_xc->verify_owner(rxid)) {
context_change:
// Context change - the guy I saw was already gone, so I should read the
// tuple's xstamp (i.e., the reader should make sure it has set the xstamp
// for the tuple once it deregisters from the bitmap). The new guy that
// inherits this bit position will spin on me b/c it'll commit after me,
// and it will also set the xstamp after spinning on me, so I'll still be
// reading the xstamp that really belongs to the older reader - this
// reduces false positives.
if (config::ssn_read_opt_enabled() and
overwritten_tuple->has_persistent_reader()) {
// If somebody thought this was an old version, xstamp alone won't be
// accurate; we should consult both the tls read_mostly cstamp and the
// xstamp (we consult the xstamp at the end of the readers loop for each
// version later in one go), e.g., a read-mostly tx read it as an old
// version after a normal tx. Note that we can't just set pstamp to
// cstamp-1 because the updater here has no clue what the previous owner
// of this bit position did and how its cstamp compares to mine.
uint64_t last_cstamp = TXN::serial_get_last_read_mostly_cstamp(xid_idx);
if (last_cstamp > cstamp) {
// The reader committed with a larger cstamp without knowing of my
// existence, i.e., it didn't account for me as a successor; nothing
// to do but abort.
return {RC_ABORT_RW_CONFLICT};
}
xc->set_pstamp(last_cstamp);
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
} // otherwise we will catch the tuple's xstamp outside the loop
} else {
// We have a valid context, now see if we should get reader's commit ts.
if (reader_state != TXN::TXN_ACTIVE and not reader_end) {
while (not reader_end) {
auto r = volatile_read(reader_xc->end);
reader_state = volatile_read(reader_xc->state);
if (not reader_xc->verify_owner(rxid)) {
goto context_change;
}
if (reader_state == TXN::TXN_ABRTD) {
reader_end = 0;
break;
}
reader_end = r;
}
}
if (reader_state == TXN::TXN_ACTIVE or not reader_end or
reader_end > cstamp) {
// Not in pre-commit yet, or will (attempt to) commit after me, or
// aborted - don't care... unless it's considered an old version by some
// reader. Still, there's a chance to set its sstamp so that the reader
// will (implicitly) know of my existence.
if (config::ssn_read_opt_enabled() and
overwritten_tuple->has_persistent_reader()) {
// Only read-mostly transactions will mark the persistent_reader bit;
// if reader_xc isn't read-mostly, then it's definitely not the one,
// so consult last_read_mostly_cstamp. We need to account for previously
// committed read-mostly txs anyway.
uint64_t last_cstamp = TXN::serial_get_last_read_mostly_cstamp(xid_idx);
if (reader_xc->xct->is_read_mostly() and
not reader_xc->set_sstamp(
(~TXN::xid_context::sstamp_final_mark) &
xc->sstamp.load(std::memory_order_acquire))) {
// Failed setting the tx's sstamp - it must have finalized sstamp,
// i.e., entered precommit, so it must have a valid cstamp.
if (reader_end == 0) {
reader_end = volatile_read(reader_xc->end);
}
if (reader_xc->verify_owner(rxid)) {
ALWAYS_ASSERT(reader_end);
while (last_cstamp < reader_end) {
// Wait until the tx sets last_cstamp or aborts
last_cstamp = TXN::serial_get_last_read_mostly_cstamp(xid_idx);
if (volatile_read(reader_xc->state) == TXN::TXN_ABRTD or
!reader_xc->verify_owner(rxid)) {
last_cstamp = TXN::serial_get_last_read_mostly_cstamp(xid_idx);
break;
}
}
} else {
// context change - the tx must have already updated its
// last_cstamp if committed.
last_cstamp = TXN::serial_get_last_read_mostly_cstamp(xid_idx);
}
if (last_cstamp > cstamp) {
// committed without knowing me
return {RC_ABORT_RW_CONFLICT};
}
} // else another transaction must be using this context, or
// we succeeded in setting the read-mostly tx's sstamp
xc->set_pstamp(last_cstamp);
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
}
} else {
ALWAYS_ASSERT(reader_end and reader_end < cstamp);
if (config::ssn_read_opt_enabled() and
overwritten_tuple->has_persistent_reader()) {
// Some reader that thinks this tuple is old existed; in case reader_xc
// is read-mostly, spin on it and consult the read_mostly_cstamp. Note
// we still need to refresh and read the xstamp outside the loop in case
// this isn't the real reader I should care about.
if (reader_xc->xct->is_read_mostly()) {
TXN::spin_for_cstamp(rxid, reader_xc);
}
xc->set_pstamp(TXN::serial_get_last_read_mostly_cstamp(xid_idx));
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
} else {
// (Pre-)committed before me, need to wait for its xstamp to finalize.
if (TXN::spin_for_cstamp(rxid, reader_xc) == TXN::TXN_CMMTD) {
xc->set_pstamp(reader_end);
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
}
// else it aborted or there was a context change during the spin - no
// clue if the reader committed, so read the xstamp in case it did commit
// (the xstamp is stable now, b/c a reader only deregisters from the
// bitmap after setting the xstamp, and a context change means the reader
// must have concluded - either aborted or committed - and so
// deregistered from the bitmap). We do this outside the loop in one go.
}
}
}
}
// Still need to re-read xstamp in case we missed any reader
xc->set_pstamp(volatile_read(overwritten_tuple->xstamp));
if (not ssn_check_exclusion(xc)) {
return rc_t{RC_ABORT_SERIAL};
}
}
if (config::ssn_read_opt_enabled() and is_read_mostly()) {
xc->finalize_sstamp();
}
if (not ssn_check_exclusion(xc)) return rc_t{RC_ABORT_SERIAL};
if (config::phantom_prot && !MasstreeCheckPhantom()) {
return rc_t{RC_ABORT_PHANTOM};
}
// ok, can really commit if we reach here
log->commit(NULL);
// Do this before setting the TXN_CMMTD state so that it'll be stable
// no matter whether the guy spinning on me noticed a context change or
// my real state (CMMTD or ABRTD).
// XXX: one optimization might be setting this only when we read some
// old versions.
if (config::ssn_read_opt_enabled() and is_read_mostly())
TXN::serial_stamp_last_committed_lsn(xc->end);
uint64_t my_sstamp = 0;
if (config::ssn_read_opt_enabled() and is_read_mostly()) {
my_sstamp = xc->sstamp.load(std::memory_order_acquire) &
(~TXN::xid_context::sstamp_final_mark);
} else {
my_sstamp = xc->sstamp.load(std::memory_order_relaxed);
}
ALWAYS_ASSERT(my_sstamp and
(my_sstamp & TXN::xid_context::sstamp_final_mark) == 0);
// post-commit: stuff access stamps for reads; init new versions
auto clsn = xc->end;
for (uint32_t i = 0; i < write_set.size(); ++i) {
auto &w = write_set[i];
Object *object = w.get_object();
dbtuple *tuple = (dbtuple *)object->GetPayload();
tuple->DoWrite();
dbtuple *next_tuple = tuple->NextVolatile();
ASSERT(not next_tuple or (object->GetNextVolatile().offset() ==
(uint64_t)next_tuple->GetObject()));
if (next_tuple) { // update, not insert
ASSERT(next_tuple->GetObject()->GetClsn().asi_type());
ASSERT(XID::from_ptr(next_tuple->sstamp) == xid);
volatile_write(next_tuple->sstamp, LSN::make(my_sstamp, 0).to_log_ptr());
ASSERT(next_tuple->sstamp.asi_type() == fat_ptr::ASI_LOG);
next_tuple->welcome_read_mostly_tx();
}
volatile_write(tuple->xstamp, cstamp);
fat_ptr clsn_ptr = object->GenerateClsnPtr(clsn);
object->SetClsn(clsn_ptr);
ASSERT(tuple->GetObject()->GetClsn().asi_type() == fat_ptr::ASI_LOG);
}
// This state change means:
// 1. New data generated by me are available to be read
// (need to do this after finishing post-commit for the write set)
// 2. My cstamp is valid and stable, can be used by
// conflicting readers as their sstamp or by conflicting
// writers as their pstamp.
COMPILER_MEMORY_FENCE;
volatile_write(xc->state, TXN::TXN_CMMTD);
// The availability of xstamp depends solely on when
// serial_deregister_reader_tx is called. So the point
// here is to do the following strictly one by one, in order:
// 1. Spin on the older successor
// 2. Set xstamp
// 3. Deregister from the bitmap
// Without 1, the updater might see a larger-than-it-should-be
// xstamp and use it as its pstamp -> more unnecessary aborts.
for (uint32_t i = 0; i < read_set.size(); ++i) {
auto &r = read_set[i];
ASSERT(r->GetObject()->GetClsn().asi_type() == fat_ptr::ASI_LOG);
// Spin to hold this position until the older successor is gone,
// so the updater can get a reasonable xstamp (not too high)
auto sstamp = volatile_read(r->sstamp);
if (sstamp.asi_type() == fat_ptr::ASI_XID) {
XID oxid = XID::from_ptr(sstamp);
if (oxid != this->xid) { // exclude myself
TXN::xid_context *ox = TXN::xid_get_context(oxid);
if (ox) {
auto ox_end = volatile_read(ox->end);
auto ox_owner = volatile_read(ox->owner);
if (ox_owner == oxid and ox_end and ox_end < cstamp)
TXN::spin_for_cstamp(oxid, ox);
}
// if !ox or ox_owner != oxid then the guy is
// already gone, don't bother
}
}
// Now set the access stamp - do this after the above spin so
// the writer will read the xstamp that was really set by the
// preceding reader, instead of some younger reader that will
// commit after it (like me).
set_tuple_xstamp(r, cstamp);
// Must deregister from the bitmap **after** setting the xstamp, so that
// the updater will be able to see the correct xstamp after noticing
// a context change; otherwise it might miss it and read a too-old
// xstamp that was set by some earlier reader.
serial_deregister_reader_tx(&r->readers_bitmap);
}
return rc_t{RC_TRUE};
}
#elif defined(SSI)
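// SSI parallel pre-commit: from the read set, compute ct3 (the earliest commit
// stamp of a transaction that overwrote something we read); then, if ct3 is
// set, check the readers of the versions we overwrite - if any such reader
// committed (or may commit) with a stamp >= ct3, we would be the pivot T2 of
// a dangerous structure and must abort.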
rc_t transaction::parallel_ssi_commit() {
// tzwang: The race between the updater (if any) and me (as the reader) -
// A reader publishes its existence by calling serial_register_reader().
// The updater might not notice this if it already checked this tuple
// before the reader published its existence. In this case, the writer
// actually thinks nobody is reading the version that's being overwritten.
// So as the reader, we need to make sure we know the existence of the
// updater during precommit. We do this by checking the tuple's sstamp
// (and it's basically how this parallel pre-commit works).
//
// Side note: this "writer-oversights-reader" problem pertains only to
// SSI, because it needs to track "concurrent reads" or some read that
// belongs to an evil T1 (assuming I'm the unlucky T2), while for SSN
// it doesn't matter - SSN only cares about things that got committed
// before, and a reader that registers after the writer entered precommit
// will definitely precommit after the writer.
//
// Reader's protocol:
// 1. If sstamp is not set: no updater yet, but if later an updater comes,
// it will enter precommit after me (if it can survive). So by the time
// the updater started to look at the readers bitmap, if I'm still in
// precommit, I need to make sure it sees me on the bitmap (trivial, as
// I won't change any bitmaps after entering precommit); or if I committed,
// I need to make sure it sees my updated xstamp. This essentially means
// the reader shouldn't deregister from the bitmap before setting xstamp.
//
// 2. If sstamp is ASI_LOG: the easy case, the updater already committed;
// do the usual SSI checks.
//
// 3. If sstamp is ASI_XID: the most complicated case, updater still active.
// a) If the updater has a cstamp < my cstamp: it will commit before
// me, so reader should spin on it and find out the final sstamp
// result, then do usual SSI checks.
//
// b) If the updater has a cstamp > my cstamp: the writer entered
// precommit after I did, so it definitely knows my existence -
// b/c when it entered precommit, I won't change any bitmap's
// bits (b/c I'm already in precommit). The updater should check
// the reader's cstamp (which will be the version's xstamp), i.e.,
// the updater will spin if it finds the reader is in pre-commit;
// otherwise it will read the xstamp directly from the version.
//
// c) If the updater doesn't have a cstamp: similar to 1 above),
// it will know about my existence after entered precommit. The
// rest is the same - reader has the responsibility to make sure
// xstamp or bitmap reflect its visibility.
//
// d) If the updater has a cstamp < my cstamp: I need to spin until
// the updater has left to hold my position in the bitmap, so
// that the updater can know that I'm a concurrent reader. This
// can be done in post-commit, right before I have to pull myself
// out from the bitmap.
//
// The writer then doesn't have much burden, it just needs to take a look
// at the readers bitmap and abort if any reader is still active or any
// xstamp > ct3; overwritten versions' xstamp are guaranteed to be valid
// because the reader won't remove itself from the bitmap unless it updated
// v.xstamp.
auto cstamp = xc->end;
// get the smallest s1 in each tuple we have read (ie, the smallest cstamp
// of T3 in the dangerous structure that clobbered our read)
uint64_t ct3 = xc->ct3; // this will be the s2 of versions I clobbered
auto &read_set = GetReadSet();
for (uint32_t i = 0; i < read_set.size(); ++i) {
auto &r = read_set[i];
get_overwriter:
fat_ptr overwriter_clsn = volatile_read(r->sstamp);
if (overwriter_clsn == NULL_PTR) continue;
uint64_t tuple_s1 = 0;
if (overwriter_clsn.asi_type() == fat_ptr::ASI_LOG) {
// already committed, read tuple's sstamp
ALWAYS_ASSERT(overwriter_clsn.asi_type() == fat_ptr::ASI_LOG);
tuple_s1 = volatile_read(overwriter_clsn).offset();
} else {
ALWAYS_ASSERT(overwriter_clsn.asi_type() == fat_ptr::ASI_XID);
XID ox = XID::from_ptr(overwriter_clsn);
if (ox == xc->owner) // myself
continue;
ASSERT(ox != xc->owner);
TXN::xid_context *overwriter_xc = TXN::xid_get_context(ox);
if (not overwriter_xc) goto get_overwriter;
// A race exists between me and the overwriter (similar to that in SSN):
// a transaction must transition to the COMMITTING state before setting its
// cstamp, and I must read the overwriter's state first, before looking at
// its cstamp; otherwise, if I look at the overwriter's cstamp directly, I
// might miss an overwriter that actually has a smaller cstamp - obtaining
// a cstamp and storing it in xc->end are not done atomically in a single
// instruction. So start here by reading the overwriter's state.
// Must obtain the overwriter's status first then check ownership
auto overwriter_state = volatile_read(overwriter_xc->state);
if (not overwriter_xc->verify_owner(ox)) {
goto get_overwriter;
}
if (overwriter_state == TXN::TXN_ACTIVE) {
// successor really still hasn't entered pre-commit, skip
continue;
}
uint64_t overwriter_end = 0;
bool should_continue = false;
while (overwriter_end == 0) {
auto s = volatile_read(overwriter_xc->end);
overwriter_state = volatile_read(overwriter_xc->state);
if (not overwriter_xc->verify_owner(ox)) {
goto get_overwriter;
}
if (overwriter_state == TXN::TXN_ABRTD) {
should_continue = true;
break;
}
overwriter_end = s;
}
if (should_continue) {
continue;
}
ALWAYS_ASSERT(overwriter_end);
ALWAYS_ASSERT(overwriter_end != cstamp);
if (overwriter_end > cstamp) {
continue;
}
// Spin if the overwriter entered precommit before me: need to
// find out the final sstamp value.
// => the updater might not know of my existence and might commit.
if (overwriter_state == TXN::TXN_COMMITTING) {
overwriter_state = TXN::spin_for_cstamp(ox, overwriter_xc);
}
if (overwriter_state == TXN::TXN_INVALID) // context change, retry
goto get_overwriter;
else if (overwriter_state == TXN::TXN_CMMTD)
tuple_s1 = overwriter_end;
}
if (tuple_s1 and (not ct3 or ct3 > tuple_s1)) ct3 = tuple_s1;
// Now the updater (if it exists) should've already concluded and stamped
// s2 - this requires the updater to change state to CMMTD only after
// setting all s2 values.
COMPILER_MEMORY_FENCE;
if (volatile_read(r->s2)) return {RC_ABORT_SERIAL};
// Release read lock (readers bitmap) after setting xstamp in post-commit
}
auto &write_set = GetWriteSet();
if (ct3) {
// now see if I'm the unlucky T2
for (uint32_t i = 0; i < write_set.size(); ++i) {
auto &w = write_set[i];
dbtuple *overwritten_tuple =
w.get_object()->GetPinnedTuple()->NextVolatile();
if (not overwritten_tuple) continue;
// Note: the bits representing readers that will commit **after**
// me are stable; those representing readers older than my cstamp
// could go away any time. But I can look at the version's xstamp
// in that case. So the reader should make sure when it goes away
// from the bitmap, xstamp is ready to be read by the updater.
TXN::readers_bitmap_iterator readers_iter(&overwritten_tuple->readers_bitmap);
while (true) {
int32_t xid_idx = readers_iter.next(true);
if (xid_idx == -1) break;
XID rxid = volatile_read(TXN::rlist.xids[xid_idx]);
ASSERT(rxid != xc->owner);
if (rxid == INVALID_XID) continue;
uint64_t reader_end = 0;
auto reader_state = TXN::TXN_ACTIVE;
TXN::xid_context *reader_xc = NULL;
if (rxid._val) {
reader_xc = TXN::xid_get_context(rxid);
if (reader_xc) {
// copy everything before doing anything
reader_state = volatile_read(reader_xc->state);
reader_end = volatile_read(reader_xc->end);
}
}
if (not rxid._val or not reader_xc or
not reader_xc->verify_owner(rxid)) {
context_change:
// Context change: The guy I saw on the bitmap is gone - it
// must have had a cstamp older than mine (otherwise it couldn't
// go away before me), or it gave up even before entering
// pre-commit. So I should read the xstamp in case it did commit.
//
// No need to worry about the new guy who occupied this bit:
// it'll spin on me, since I've already entered pre-commit, to maintain
// its position in the bitmap.
auto tuple_xstamp = volatile_read(overwritten_tuple->xstamp);
// xstamp might still be 0 - if the reader aborted
if (tuple_xstamp >= ct3) return {RC_ABORT_SERIAL};
} else {
bool should_continue = false;
if (reader_state != TXN::TXN_ACTIVE and not reader_end) {
while (not reader_end) {
auto r = volatile_read(reader_xc->end);
reader_state = volatile_read(reader_xc->state);
if (not reader_xc->verify_owner(rxid)) {
goto context_change;
}
if (reader_state == TXN::TXN_ABRTD) {
should_continue = true;
break;
}
reader_end = r;
}
}
if (should_continue) { // reader aborted
continue;
}
if (reader_state == TXN::TXN_ACTIVE or not reader_end) {
// The reader is not in precommit yet, and it's not clear whether it
// will commit. But aborting the pivot is easier here, so we
// just abort (betting the reader is likely to succeed).
//
// Another way is to do nothing here; the reader will notice
// my presence or my legacy (the sstamp on the version) once it
// enters precommit; this might cause a deadlock though.
return {RC_ABORT_SERIAL};
} else if (reader_end < cstamp) {
ASSERT(ct3);
// This reader_end will be the version's xstamp if it committed
if (reader_end >= ct3) {
auto cr = TXN::spin_for_cstamp(rxid, reader_xc);
if (cr == TXN::TXN_CMMTD)
return {RC_ABORT_SERIAL};
else if (cr == TXN::TXN_INVALID) {
// Context change, no clue if the reader committed
// or aborted, read tuple's xstamp - the same
// reasoning as in SSN
if (volatile_read(overwritten_tuple->xstamp) >= ct3)
return {RC_ABORT_SERIAL};
}
} // otherwise it's not worth the spin - we won't use it anyway
} else {
// Reader will commit after me, ie its cstamp will be > ct3
// (b/c ct3 is the first committed in the dangerous structure)
// and will not release its seat in the bitmap until I'm gone
return {RC_ABORT_SERIAL};
}
}
}
// Check xstamp in case in-flight readers left or there was no reader
if (volatile_read(overwritten_tuple->xstamp) >= ct3)
return {RC_ABORT_SERIAL};
}
}
if (config::phantom_prot && !MasstreeCheckPhantom()) {
return rc_t{RC_ABORT_PHANTOM};
}
// survived!
log->commit(NULL);
// stamp overwritten versions, stuff clsn
auto clsn = xc->end;
for (uint32_t i = 0; i < write_set.size(); ++i) {
auto &w = write_set[i];
Object *object = w.get_object();
dbtuple *tuple = (dbtuple *)object->GetPayload();
tuple->DoWrite();
dbtuple *overwritten_tuple = tuple->NextVolatile();
fat_ptr clsn_ptr = object->GenerateClsnPtr(clsn);
if (overwritten_tuple) { // update
ASSERT(not overwritten_tuple->s2);
// Must set sstamp first before setting s2 (ssi_read assumes sstamp is
// available once s2 is available)
ASSERT(XID::from_ptr(volatile_read(overwritten_tuple->sstamp)) == xid);
volatile_write(overwritten_tuple->sstamp, clsn_ptr);
// Must set s2 first, before setting clsn
volatile_write(overwritten_tuple->s2, ct3);
COMPILER_MEMORY_FENCE;
}
volatile_write(tuple->xstamp, cstamp);
object->SetClsn(clsn_ptr);
ASSERT(tuple->GetObject()->GetClsn().asi_type() == fat_ptr::ASI_LOG);
}
// NOTE: make sure this happens after populating log block,
// otherwise readers will see inconsistent data!
// This is where (committed) tuple data are made visible to readers
//
// This needs to happen after setting overwritten tuples' s2, b/c
// the reader needs to check this during pre-commit.
COMPILER_MEMORY_FENCE;
volatile_write(xc->state, TXN::TXN_CMMTD);
// Similar to the SSN implementation, xstamp's availability depends solely
// on when deregister_reader_tx is called, not on when we transition to the
// "committed" state.
for (uint32_t i = 0; i < read_set.size(); ++i) {
auto &r = read_set[i];
// Update xstamps in the read versions; this should happen before
// deregistering from the bitmap, so that when the updater finds a
// context change, it'll get a stable xstamp from the tuple.
// No need to look into the write set and skip: DoTupleRead will
// skip inserting into the read set if the tuple is already in the write
// set; a tuple can appear in both the read and write sets only if
// the tuple is first read, then updated - updating the xstamp
// of such a tuple won't hurt, and it eliminates unnecessary
// cycles spent on the hashtable.
set_tuple_xstamp(r, cstamp);
// Now wait for the updater that pre-committed before me to go away;
// this effectively means I'm holding my position in the bitmap
// and preventing it from being reused by another reader before
// the overwriter leaves. So the overwriter will be able to see
// a stable readers bitmap, tell if there's an active reader,
// and if so decide whether it has to abort because it found itself
// being an unlucky T2.
auto sstamp = volatile_read(r->sstamp);
if (sstamp.asi_type() == fat_ptr::ASI_XID) {
XID oxid = XID::from_ptr(sstamp);
if (oxid != this->xid) { // exclude myself
TXN::xid_context *ox = TXN::xid_get_context(oxid);
if (ox) {
auto ox_end = volatile_read(ox->end);
if (ox->verify_owner(oxid) and ox_end and ox_end < cstamp)
TXN::spin_for_cstamp(oxid, ox);
}
// if !ox or ox_owner != oxid then the guy is
// already gone, don't bother
}
}
COMPILER_MEMORY_FENCE;
// now it's safe to release my seat!
// Need to do this after setting xstamp, so that the updater can
// see the xstamp if it doesn't find the bit set in the bitmap.
serial_deregister_reader_tx(&r->readers_bitmap);
}
return rc_t{RC_TRUE};
}
#elif defined(MVOCC)
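// MVOCC commit: obtain a commit timestamp from the log (abort internally if
// pre-commit fails) and run the phantom-protection check before proceeding.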
rc_t transaction::mvocc_commit() {
if (!(flags & TXN_FLAG_CMD_REDO) && config::is_backup_srv()) {
return rc_t{RC_TRUE};
}
ASSERT(log);
// get clsn, abort if failed
xc->end = log->pre_commit().offset();
if (xc->end == 0) {
return rc_t{RC_ABORT_INTERNAL};
}
if (config::phantom_prot && !MasstreeCheckPhantom()) {
return rc_t{RC_ABORT_PHANTOM};
}