ACTF 2025 WriteUp (Reverse方向)

7.5k 词

re出题人怎么这么喜欢矩阵,FPGA和deeptx都属于恢复出来逻辑不好逆的

ezFPGA:

查了查FPGA的相关知识,先用Digital-IDE查看vcd信号:

提取密文:AD00C09F1617EC25251F12E27F9F375312BA8D3860141B318E13E2560A1A25B980738A60

从encryptor恢复加密逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// Encryptor: recovered model of the challenge's cipher core.
// Pipeline, all in 8-bit (uint8_t) words: FLAG padded to 39 bytes -> 4-tap weighted
// sum (ac) -> multiplication of each group of 6 by the 6x6 table ad (ae) ->
// addition of a keystream byte taken from the 256-entry table ba (af) ->
// one byte per clock on `cypher`.
module Encryptor#(
parameter logic [7:0] FLAG [0:13] = {
"A", "C", "T", "F", "{", "t", "e", "s", "t", "f", "l", "a", "g", "}"
}
)(
input rst,
input clk,
output logic [7:0] cypher
);

typedef logic [7:0] uint8_t;
// l is the number of bytes in FLAG.
localparam l = $size(FLAG);

// aa: FLAG followed by zero padding up to 39 bytes.
uint8_t aa [38:0];

genvar i;
generate
for(i = 0; i < l; i++) begin : gen
assign aa[i] = FLAG[i];
end
for (i = l; i < 39; i++) begin : gen
assign aa[i] = 0;
end
endgenerate

// ab: the four weights of the sliding-window sum.
uint8_t ab [0:3] = {11,4,5,14};
uint8_t ac [35:0];

// ac[i] combines aa[i..i+3]; the result is stored in an 8-bit word.
generate
for(i = 0; i < 36; i++) begin : gen
assign ac[i] = aa[i]*ab[0] + aa[i+1]*ab[1] + aa[i+2]*ab[2] + aa[i+3]*ab[3];
end
endgenerate

// ad: 36 constants indexed below as a 6x6 table (row = index / 6, column = index % 6).
uint8_t ad [0:35] = {116,174,193,124,102,100,11,193,115,4,127,139,98,214,197,145,97,151,31,30,117,15,230,179,235,25,244,202,73,222,15,191,119,140,94,32};

uint8_t ae [35:0];

// ae[i]: dot product of the group of six ac values containing i
// (ac[i/6*6 .. i/6*6+5]) with column i%6 of ad.
generate
for (i = 0; i < 36; i = i + 1) begin
assign ae[i] = ac[i/6*6]*ad[i%6]+ac[i/6*6+1]*ad[i%6+6]+ac[i/6*6+2]*ad[i%6+12]+ac[i/6*6+3]*ad[i%6+18]+ac[i/6*6+4]*ad[i%6+24]+ac[i/6*6+5]*ad[i%6+30];
end
endgenerate

// af: final ciphertext bytes; ba: 256-entry permutation table; db: 8-byte key.
// ca/cb and cg are the running indices of the S2 and S1 phases; da is the step counter.
uint8_t af[35:0];
uint8_t ba[255:0];
uint8_t ca,cb,cd,ce,cf,cg,ch;
uint8_t da;
uint8_t db[0:7] = {"e","c","l","i","p","s","k","y"};
typedef enum logic[1:0] {S0,S1,S2,S3} state_t;
state_t state;

// Combinational next-index values used by the state machine below.
assign cd = ca + 1;
assign ce = cb + ba[cd];
assign cf = ba[cd] + ba[ce];
assign ch = cg + ba[da] + db[da%8];

always_ff @( posedge clk or posedge rst) begin
if (rst) begin
ca <= 0;
cb <= 0;
cg <= 0;
da <= 0;
cypher <= 0;
// NOTE(review): reset enters S1 and no transition below leads to S0, so as
// written ba is never filled with 0..255. The Python model in this write-up
// assumes that fill happens - the original design presumably resets to S0; confirm.
state <= S1;
end else begin
case (state)
// S0: fill ba[da] = da for da = 0..255, then move to S1.
S0: begin
if (da != 8'd255) begin
ba[da] <= da;
da <= da + 1;
end else begin
ba[da] <= da;
da <= 0;
state <= S1;
end
end
// S1: key-dependent shuffle - swap ba[da] with ba[ch] for da = 0..255,
// where ch accumulates ba[da] and the key byte db[da%8].
S1: begin
if (da != 8'd255) begin
ba[da] <= ba[ch];
ba[ch] <= ba[da];
cg <= ch;
da <= da + 1;
end else begin
ba[da] <= ba[ch];
ba[ch] <= ba[da];
da <= 0;
state <= S2;
end
end
// S2: for each of the 36 bytes, swap ba[cd]/ba[ce] and store ba[cf] + ae[da].
// All right-hand sides are nonblocking, so ba[cf] is sampled before the swap
// of this step takes effect.
S2: begin
if (da < 36) begin
ba[cd] <= ba[ce];
ba[ce] <= ba[cd];
af[da[5:0]] <= ba[cf] + ae[da[5:0]];
ca <= cd;
cb <= ce;
da <= da + 1;
end else begin
da <= 0;
state <= S3;
end
end
// S3: shift the 36 result bytes out on `cypher`, one per clock, then hold 0.
S3: begin
if (da < 36) begin
cypher <= af[da[5:0]];
da <= da + 1;
end else begin
cypher <= 0;
end
end
endcase
end
end

endmodule

python实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Software model of the recovered Encryptor pipeline. The hardware works on
# 8-bit words, so every stage is reduced modulo 256.
# Replace the placeholder with a candidate flag to reproduce the ciphertext.
m = bytearray(b"ACTF{?????????????????????????????????}")
# The module pads the flag with zero bytes up to 39 entries; do the same so a
# shorter candidate does not run off the end of the buffer.
m = m.ljust(39, b"\x00")

# Stage 1: sliding 4-byte window with weights 11, 4, 5, 14.
c1 = [
    (m[i] * 11 + m[i + 1] * 4 + m[i + 2] * 5 + m[i + 3] * 14) & 0xFF
    for i in range(36)
]

print(c1)

# Stage 2: each group of six c1 bytes is multiplied by the 6x6 table k2
# (stored row by row).
k2 = [116, 174, 193, 124, 102, 100,
      11, 193, 115, 4, 127, 139,
      98, 214, 197, 145, 97, 151,
      31, 30, 117, 15, 230, 179,
      235, 25, 244, 202, 73, 222,
      15, 191, 119, 140, 94, 32]

c2 = [0] * 36
for i in range(36):
    ttl = 0
    for j in range(6):
        ttl += c1[i // 6 * 6 + j] * k2[i % 6 + 6 * j]
    c2[i] = ttl & 0xFF

print(c2)

# Stage 3: RC4 with key "eclipsky"; the keystream byte is ADDED to the data
# instead of being XORed.
k3 = bytearray(b"eclipsky")
sbox = list(range(256))
j = 0
for i in range(256):
    j = (j + sbox[i] + k3[i % 8]) & 0xFF
    sbox[i], sbox[j] = sbox[j], sbox[i]

c3 = [0] * 36
i = 0
j = 0
for p in range(36):
    i = (i + 1) % 256
    j = (j + sbox[i]) % 256
    sbox[i], sbox[j] = sbox[j], sbox[i]
    # Keep the sum in one byte, as the 8-bit hardware register does.
    c3[p] = (sbox[(sbox[i] + sbox[j]) % 256] + c2[p]) & 0xFF

print(c3)

# Target ciphertext extracted from the VCD trace:
# c3 = bytearray.fromhex("AD00C09F1617EC25251F12E27F9F375312BA8D3860141B318E13E2560A1A25B980738A60")

编写脚本解密:

RC4阶段:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Undo the last stage: the cipher adds an RC4 keystream byte to every data
# byte, so subtracting the same keystream (mod 256) recovers c2.
enc = bytearray.fromhex("AD00C09F1617EC25251F12E27F9F375312BA8D3860141B318E13E2560A1A25B980738A60")
de3 = bytearray(len(enc))

# Key scheduling with the 8-byte key.
k3 = bytearray(b"eclipsky")
sbox = list(range(256))
j = 0
for i in range(256):
    j = (j + sbox[i] + k3[i % 8]) & 0xFF
    sbox[i], sbox[j] = sbox[j], sbox[i]

# Keystream generation, one byte per ciphertext byte.
i = 0
j = 0
for p, cipher_byte in enumerate(enc):
    i = (i + 1) % 256
    j = (j + sbox[i]) % 256
    sbox[i], sbox[j] = sbox[j], sbox[i]
    de3[p] = (cipher_byte - sbox[(sbox[i] + sbox[j]) % 256]) & 0xFF
print(de3.hex())

得到c2:6b01bf6dbc164991f3eb4c714ef0c0c546f87d51523d8b8e4ed8a2b233e3f4875287c322

矩阵阶段:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from z3 import BitVec, Solver, sat

# 6x6 table of the matrix stage, stored row by row.
k2 = [116, 174, 193, 124, 102, 100,
      11, 193, 115, 4, 127, 139,
      98, 214, 197, 145, 97, 151,
      31, 30, 117, 15, 230, 179,
      235, 25, 244, 202, 73, 222,
      15, 191, 119, 140, 94, 32]

# Output of the RC4 stage inversion.
c2_hex = "6b01bf6dbc164991f3eb4c714ef0c0c546f87d51523d8b8e4ed8a2b233e3f4875287c322"
c2 = [int(c2_hex[i:i+2], 16) for i in range(0, len(c2_hex), 2)]

solver = Solver()
c1 = [BitVec(f'c1_{i}', 8) for i in range(36)]
for i in range(36):
    # 8-bit bit-vectors wrap modulo 256, exactly like the hardware words.
    ttl = 0
    for j in range(6):
        ttl += c1[i // 6 * 6 + j] * k2[i % 6 + 6 * j]
    solver.add(ttl == c2[i])

# k2 reduced modulo 2 has full rank, so it is invertible modulo 256 and the
# system has exactly one solution.
if solver.check() == sat:
    model = solver.model()
    # model_completion guards against a variable missing from the model.
    c1_solution = [model.eval(c1[i], model_completion=True).as_long() for i in range(36)]
    print("c1 =", c1_solution)
else:
    print("无解")

得到c1:[79, 73, 151, 50, 184, 200, 100, 192, 131, 26, 249, 134, 159, 51, 3, 110, 114, 221, 227, 53, 217, 14, 87, 183, 172, 111, 194, 28, 122, 221, 22, 117, 218, 95, 0, 0]

加权求和阶段:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from z3 import And, BitVec, Or, Solver, UGE, ULE, sat

# Output of the matrix stage inversion.
c1 = [79, 73, 151, 50, 184, 200, 100, 192, 131, 26, 249, 134,
      159, 51, 3, 110, 114, 221, 227, 53, 217, 14, 87, 183,
      172, 111, 194, 28, 122, 221, 22, 117, 218, 95, 0, 0]

# Known flag format.
PREFIX = b"ACTF{"

solver = Solver()
m = [BitVec(f'm_{i}', 8) for i in range(39)]
for i in range(36):
    solver.add(m[i] * 11 + m[i+1] * 4 + m[i+2] * 5 + m[i+3] * 14 == c1[i])

# 36 equations in 39 unknowns do not determine the plaintext on their own, so
# add what is known about it: the flag prefix, and the fact that every byte is
# either printable ASCII or part of the trailing zero padding.
for i, ch in enumerate(PREFIX):
    solver.add(m[i] == ch)
for i in range(len(PREFIX), 39):
    solver.add(Or(m[i] == 0, And(UGE(m[i], 0x20), ULE(m[i], 0x7E))))
for i in range(len(PREFIX), 38):
    # Padding is a suffix: once a zero byte appears, everything after it is zero.
    solver.add(Or(m[i] != 0, m[i + 1] == 0))

if solver.check() == sat:
    model = solver.model()
    m_solution = [model.eval(m[i], model_completion=True).as_long() for i in range(39)]
    print("m =", m_solution)
    # Exclude the solution just found; a second model would mean the answer is ambiguous.
    solver.add(Or(*[m[i] != m_solution[i] for i in range(39)]))
    if solver.check() == sat:
        print("warning: solution is not unique")
else:
    print("无解")

得到flag:[65, 67, 84, 70, 123, 82, 67, 52, 95, 52, 110, 100, 95, 70, 80, 71, 65, 95, 119, 52, 108, 107, 95, 49, 110, 116, 48, 95, 52, 95, 98, 52, 114, 125, 0, 0, 0, 0, 0]

flag:ACTF{RC4_4nd_FPGA_w4lk_1nt0_4_b4r}.

deeptx:

cuda逆向,查看main:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
int __fastcall main(int argc, const char **argv, const char **envp)
{
int v3; // ebx
_QWORD v5[64]; // [rsp+0h] [rbp-8C0h] BYREF
unsigned __int8 *output_ptr; // [rsp+200h] [rbp-6C0h] BYREF
unsigned __int8 *pixel_ptr; // [rsp+208h] [rbp-6B8h] BYREF
char ptr__1[1024]; // [rsp+210h] [rbp-6B0h] BYREF
char ptr_[4]; // [rsp+610h] [rbp-2B0h] BYREF
int n256_1; // [rsp+614h] [rbp-2ACh]
int n256; // [rsp+618h] [rbp-2A8h]
__int16 n8; // [rsp+61Eh] [rbp-2A2h]
int v13; // [rsp+620h] [rbp-2A0h]
char ptr[14]; // [rsp+642h] [rbp-27Eh] BYREF
_QWORD v15[32]; // [rsp+650h] [rbp-270h] BYREF
__int64 v16; // [rsp+750h] [rbp-170h] BYREF
__int64 v17; // [rsp+858h] [rbp-68h] BYREF
unsigned int v18; // [rsp+860h] [rbp-60h]
__int64 v19; // [rsp+864h] [rbp-5Ch] BYREF
unsigned int v20; // [rsp+86Ch] [rbp-54h]
__int64 v21; // [rsp+870h] [rbp-50h] BYREF
unsigned int v22; // [rsp+878h] [rbp-48h]
__int64 v23; // [rsp+87Ch] [rbp-44h] BYREF
unsigned int v24; // [rsp+884h] [rbp-3Ch]
__int64 v25; // [rsp+888h] [rbp-38h] BYREF
unsigned int v26; // [rsp+890h] [rbp-30h]
__int64 v27; // [rsp+894h] [rbp-2Ch] BYREF
unsigned int v28; // [rsp+89Ch] [rbp-24h]
char *ptr_3; // [rsp+8A0h] [rbp-20h]
char *pixel; // [rsp+8A8h] [rbp-18h]

std::ifstream::basic_ifstream(file, "flag.bmp", 4LL);
if ( (unsigned __int8)std::ios::operator!(&v16) )
{
v3 = -1;
}
else
{
std::istream::read((std::istream *)file, ptr, 14LL);
std::istream::read((std::istream *)file, ptr_, 40LL);
if ( n256_1 == 256 )
{
if ( n256 == 256 )
{
if ( n8 == 8 )
{
if ( v13 )
{
v3 = -1;
}
else
{
std::istream::read((std::istream *)file, ptr__1, 1024LL);
pixel = (char *)malloc(0x10000uLL);
ptr_3 = (char *)malloc(0x10000uLL);
cudaMemcpyToSymbol<unsigned char [256]>(&cuda_sbox, sbox, 256LL, 0LL, 1LL);
cudaMemcpyToSymbol<unsigned char [256]>(&cuda_tbox, tbox, 256LL, 0LL, 1LL);
cudaMemcpyToSymbol<float [256]>((__int64)&cuda_motion, (__int64)motion, 1024LL, 0LL, 1u);
cudaMalloc<unsigned char>(&pixel_ptr, 0x10000LL);
cudaMalloc<unsigned char>(&output_ptr, 0x10000LL);
std::istream::read((std::istream *)file, pixel, 0x10000LL);
std::ifstream::close(file);
cudaMemcpy(pixel_ptr, pixel, 0x10000LL, 1LL);
dim3::dim3((dim3 *)&v17, 256, 1, 1);
dim3::dim3((dim3 *)&v19, 256, 1, 1);
if ( !(unsigned int)_cudaPushCallConfiguration(v19, v20, v17, v18, 0LL, 0LL) )
Layer1(pixel_ptr, output_ptr);
cudaDeviceSynchronize();
dim3::dim3((dim3 *)&v21, 256, 1, 1);
dim3::dim3((dim3 *)&v23, 256, 1, 1);
if ( !(unsigned int)_cudaPushCallConfiguration(v23, v24, v21, v22, 0LL, 0LL) )
Layer2(output_ptr, pixel_ptr);
cudaDeviceSynchronize();
dim3::dim3((dim3 *)&v25, 256, 1, 1);
dim3::dim3((dim3 *)&v27, 256, 1, 1);
if ( !(unsigned int)_cudaPushCallConfiguration(v27, v28, v25, v26, 0LL, 0LL) )
Layer3(pixel_ptr, output_ptr);
cudaDeviceSynchronize();
cudaMemcpy(ptr_3, output_ptr, 0x10000LL, 2LL);
std::ofstream::basic_ofstream(v5, "deep_flag.bmp", 4LL);
std::ostream::write((std::ostream *)v5, ptr, 14LL);
std::ostream::write((std::ostream *)v5, ptr_, 40LL);
std::ostream::write((std::ostream *)v5, ptr__1, 1024LL);
std::ostream::write((std::ostream *)v5, ptr_3, 0x10000LL);
std::ofstream::close(v5);
free(pixel);
free(ptr_3);
cudaFree(pixel_ptr);
cudaFree(output_ptr);
v3 = 0;
std::ofstream::~ofstream(v5);
}
}
else
{
v3 = -1;
}
}
else
{
v3 = -1;
}
}
else
{
v3 = -1;
}
}
std::ifstream::~ifstream(v15);
return v3;
}

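程序读取flag.bmp,并校验它是256×256、8位色深(n8 == 8)、未压缩(v13 == 0)的位图:先读14字节文件头和40字节信息头,再读1024字节的调色板,所以每个像素只占一个字节(0~0xFF的调色板索引,通常是灰度图),而不是RGB三元组。被加密的是256*256=65536个像素字节,依次经过Layer1、Layer2、Layer3三个kernel处理;文件头和调色板原样写入deep_flag.bmp。用cuobjdump导出ptx: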

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin ptx code:
================
arch = sm_86
code version = [8,7]
host = linux
compile_size = 64bit
compressed
ptxasOptions =

//
//
//
//
//
//

.version 8.7
.target sm_86
.address_size 64

//
.const .align 1 .b8 cuda_sbox[256];
.const .align 1 .b8 cuda_tbox[256];
.const .align 4 .b8 cuda_motion[1024];

.visible .entry _Z6Layer1PhS_(
.param .u64 _Z6Layer1PhS__param_0,
.param .u64 _Z6Layer1PhS__param_1
)
{
.reg .pred %p<6>;
.reg .b16 %rs<2>;
.reg .f32 %f<12>;
.reg .b32 %r<23>;
.reg .b64 %rd<15>;


ld.param.u64 %rd5, [_Z6Layer1PhS__param_0];
ld.param.u64 %rd6, [_Z6Layer1PhS__param_1];
mov.u32 %r1, %tid.x;
setp.lt.u32 %p1, %r1, 241;
mov.u32 %r2, %ctaid.x;
setp.lt.u32 %p2, %r2, 241;
and.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_2;
bra.uni $L__BB0_1;

$L__BB0_2:
mov.u32 %r3, %ntid.x;
cvta.to.global.u64 %rd1, %rd5;
mov.f32 %f10, 0f00000000;
mov.u32 %r11, 0;
mov.u64 %rd8, cuda_motion;
mov.u32 %r20, %r11;

$L__BB0_3:
.pragma "nounroll";
add.s32 %r13, %r20, %r2;
shl.b32 %r14, %r20, 4;
mov.u32 %r15, 240;
sub.s32 %r16, %r15, %r14;
mad.lo.s32 %r21, %r13, %r3, %r1;
mul.wide.u32 %rd7, %r16, 4;
add.s64 %rd14, %rd8, %rd7;
mov.u32 %r22, %r11;

$L__BB0_4:
.pragma "nounroll";
cvt.u64.u32 %rd9, %r21;
add.s64 %rd10, %rd1, %rd9;
ld.global.u8 %rs1, [%rd10];
cvt.rn.f32.u16 %f7, %rs1;
ld.const.f32 %f8, [%rd14];
fma.rn.f32 %f10, %f8, %f7, %f10;
add.s32 %r21, %r21, 1;
add.s64 %rd14, %rd14, 4;
add.s32 %r22, %r22, 1;
setp.ne.s32 %p4, %r22, 16;
@%p4 bra $L__BB0_4;

add.s32 %r20, %r20, 1;
setp.lt.u32 %p5, %r20, 16;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_6;

$L__BB0_1:
mov.f32 %f10, 0f00000000;

$L__BB0_6:
cvt.rzi.u32.f32 %r17, %f10;
mov.u32 %r18, %ntid.x;
mad.lo.s32 %r19, %r2, %r18, %r1;
cvt.u64.u32 %rd11, %r19;
cvta.to.global.u64 %rd12, %rd6;
add.s64 %rd13, %rd12, %rd11;
st.global.u8 [%rd13], %r17;
ret;

}
//
.visible .entry _Z6Layer2PhS_(
.param .u64 _Z6Layer2PhS__param_0,
.param .u64 _Z6Layer2PhS__param_1
)
{
.reg .b16 %rs<2>;
.reg .b32 %r<8>;
.reg .b64 %rd<14>;


ld.param.u64 %rd1, [_Z6Layer2PhS__param_0];
ld.param.u64 %rd2, [_Z6Layer2PhS__param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd5, %r4;
add.s64 %rd6, %rd4, %rd5;
ld.global.u8 %rs1, [%rd6];
cvt.u64.u32 %rd7, %r3;
mov.u64 %rd8, cuda_sbox;
add.s64 %rd9, %rd8, %rd7;
ld.const.u8 %r5, [%rd9];
cvt.u64.u32 %rd10, %r1;
add.s64 %rd11, %rd8, %rd10;
ld.const.u8 %r6, [%rd11];
mad.lo.s32 %r7, %r2, %r5, %r6;
cvt.u64.u32 %rd12, %r7;
add.s64 %rd13, %rd3, %rd12;
st.global.u8 [%rd13], %rs1;
ret;

}
//
.visible .entry _Z6Layer3PhS_(
.param .u64 _Z6Layer3PhS__param_0,
.param .u64 _Z6Layer3PhS__param_1
)
{
.reg .pred %p<5>;
.reg .b16 %rs<33>;
.reg .b32 %r<52>;
.reg .b64 %rd<24>;


ld.param.u64 %rd6, [_Z6Layer3PhS__param_0];
ld.param.u64 %rd5, [_Z6Layer3PhS__param_1];
mov.u32 %r21, %ntid.x;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r49, %r1, %r21;
mov.u32 %r3, %tid.x;
add.s32 %r22, %r49, %r3;
cvt.u64.u32 %rd1, %r22;
cvta.to.global.u64 %rd2, %rd6;
add.s64 %rd3, %rd2, %rd1;
cvt.u16.u32 %rs8, %r3;
cvt.u16.u32 %rs9, %r1;
or.b16 %rs10, %rs9, %rs8;
ld.global.u8 %rs11, [%rd3];
xor.b16 %rs12, %rs11, %rs10;
st.global.u8 [%rd3], %rs12;
bar.sync 0;
and.b32 %r23, %r3, 7;
setp.ne.s32 %p1, %r23, 0;
@%p1 bra $L__BB2_4;

ld.global.u32 %r47, [%rd3+4];
ld.global.u32 %r48, [%rd3];
mov.u32 %r46, 1786956040;
mov.u32 %r45, 0;

$L__BB2_2:
.pragma "nounroll";
shl.b32 %r26, %r48, 4;
add.s32 %r27, %r26, 1386807340;
shr.u32 %r28, %r48, 5;
add.s32 %r29, %r28, 2007053320;
xor.b32 %r30, %r29, %r27;
add.s32 %r31, %r48, %r46;
xor.b32 %r32, %r30, %r31;
add.s32 %r47, %r32, %r47;
shl.b32 %r33, %r47, 4;
add.s32 %r34, %r33, 621668851;
add.s32 %r35, %r46, %r47;
xor.b32 %r36, %r34, %r35;
shr.u32 %r37, %r47, 5;
add.s32 %r38, %r37, -862448841;
xor.b32 %r39, %r36, %r38;
sub.s32 %r48, %r48, %r39;
add.s32 %r46, %r46, -1708609273;
add.s32 %r45, %r45, 1;
setp.ne.s32 %p2, %r45, 3238567;
@%p2 bra $L__BB2_2;

st.global.u32 [%rd3], %r48;
st.global.u32 [%rd3+4], %r47;

$L__BB2_4:
bar.sync 0;
and.b16 %rs16, %rs9, %rs8;
ld.global.u8 %rs17, [%rd3];
xor.b16 %rs18, %rs17, %rs16;
st.global.u8 [%rd3], %rs18;
bar.sync 0;
cvt.u64.u32 %rd7, %r3;
mov.u64 %rd8, cuda_sbox;
add.s64 %rd9, %rd8, %rd7;
ld.const.u8 %rs31, [%rd9];
cvta.to.global.u64 %rd4, %rd5;
mov.u16 %rs32, 0;
mov.u32 %r50, 0;
mov.u64 %rd14, cuda_tbox;

$L__BB2_5:
.pragma "nounroll";
cvt.u64.u32 %rd10, %r49;
add.s64 %rd11, %rd2, %rd10;
cvt.u64.u16 %rd12, %rs31;
and.b64 %rd13, %rd12, 255;
add.s64 %rd15, %rd14, %rd13;
ld.const.u8 %rs19, [%rd15];
ld.global.u8 %rs20, [%rd11];
mul.lo.s16 %rs21, %rs19, %rs20;
add.s16 %rs32, %rs21, %rs32;
mul.lo.s16 %rs22, %rs31, 5;
add.s16 %rs31, %rs22, 17;
add.s32 %r49, %r49, 1;
add.s32 %r50, %r50, 1;
setp.ne.s32 %p3, %r50, 256;
@%p3 bra $L__BB2_5;

xor.b32 %r18, %r1, %r3;
mov.u32 %r51, 8;

$L__BB2_7:
.pragma "nounroll";
shl.b16 %rs23, %rs32, 3;
and.b16 %rs24, %rs32, 224;
shr.u16 %rs25, %rs24, 5;
or.b16 %rs26, %rs25, %rs23;
cvt.u32.u16 %r42, %rs26;
mad.lo.s32 %r43, %r42, 13, %r18;
and.b32 %r44, %r51, 255;
cvt.u64.u32 %rd16, %r44;
add.s64 %rd18, %rd14, %rd16;
cvt.u16.u32 %rs27, %r43;
ld.const.u8 %rs28, [%rd18];
xor.b16 %rs29, %rs28, %rs27;
cvt.u64.u16 %rd19, %rs29;
and.b64 %rd20, %rd19, 255;
add.s64 %rd22, %rd8, %rd20;
ld.const.u8 %rs32, [%rd22];
add.s32 %r51, %r51, 1;
setp.ne.s32 %p4, %r51, 4137823;
@%p4 bra $L__BB2_7;

add.s64 %rd23, %rd4, %rd1;
st.global.u8 [%rd23], %rs32;
ret;

}

motion是一个浮点数组构成的模糊核:

1
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01248378586024046, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0055596730671823025, 0.04262230917811394, 0.01248378586024046, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0055596730671823025, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0

sbox:

1
D690E9FECCE13DB716B614C228FB2C052B679A762ABE04C3AA441326498606999C4250F491EF987A33540B43EDCFAC62E4B31CA9C908E89580DF94FA758F3FA64707A7FCF37317BA83593C19E6854FA8686B81B27164DA8BF8EB0F4B70569D351E240E5E6358D1A225227C3B01217887D40046579FD327524C3602E7A0C4C89EEABF8AD240C738B5A3F7F2CEF96115A1E0AE5DA49B341A55AD933230F58CB1E31DF6E22E8266CA60C02923AB0D534E6FD5DB3745DEFD8E2F03FF6A726D6C5B518D1BAF92BBDDBC7F11D95C411F105AD80AC13188A5CD7BBD2D74D012B8E5B4B08969974A0C96777E65B9F109C56EC68418F07DEC3ADC4D2079EE5F3ED7CB3948

tbox:

1
627C767AF26A6EC43000662AFED6AA76CA82C87CFA5846F0ACD4A2AE9CA472C0B6FC9226363EF6CC34A4E4F070D8301404C622C21896049A061280E2EA26B27408822C1A1A6E5AA0523AD6B228E22E8452D000EC20FCB05A6ACABE384A4C58CED0EEAAFA424C328444F8027E503C9EA850A2408E929C38F4BCB6DA2010FEF2D2CC0C12EC5E964416C4A67E3C645C187260804EDC222A908846EFB814DE5E0ADAE0323A0A4806245CC2D2AC629094E478E6C8366C8CD44EA86C56F4EA647AAE08BA78242E1CA6B4C6E8DC741E4ABC8A8A703EB4664802F60E603456B886C01C9EE0F8981068D88E949A1E86E8CE5428DE8CA0880CBEE6426840982C0EB054BA16

Layer1

逻辑恢复(by huanghunr):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Layer1 (recovered from the PTX): each thread accumulates a 16x16 window of
// `input`, weighted by the cuda_motion kernel, and writes one byte to `output`.
__global__ void Layer1(uint8_t* input, uint8_t* output) {
int tid = threadIdx.x; // index of this thread inside its block
int blockid = blockIdx.x; // index of this block
int ntid = blockDim.x; // number of threads per block

// Positions whose thread or block index is 241 or above are written as 0.
if (tid >= 241 || blockid >= 241) {
output[blockid * ntid + tid] = 0;
return;
}

float acc = 0.0f;

for (int i = 0; i < 16; ++i) {
int offset = (240 - (i << 4)) * 4;
float* motion = (float*)((char*)cuda_motion + offset); // &cuda_motion[240 - i * 16]: treating motion as a 16x16 matrix, this walks its rows from last to first

int idx = (blockid + i) * ntid + tid; // start of the input row segment handled in this pass
for (int j = 0; j < 16; ++j) {
acc += motion[j] * input[idx + j]; // multiply by the kernel entry and accumulate
}
}

int out_index = blockid * ntid + tid; // output position of this thread
output[out_index] = (uint8_t)(acc); // store the result as a byte
}

Layer1把输入数据看作256*256的矩阵,以16*16的motion为卷积核做卷积(只有线程索引和块索引都小于241的位置参与计算,其余位置输出0)。注意motion的行(而不是列)要倒序使用:窗口的第i行与motion的第15-i行对应相乘,即input[0][0] … input[0][15] 对应 motion[15][0] … motion[15][15]。浮点累加结果向零取整后以一个字节写回。

Layer2

是一个简单的sbox置换:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
ld.param.u64 %rd1, [_Z6Layer2PhS__param_0];
ld.param.u64 %rd2, [_Z6Layer2PhS__param_1];
cvta.to.global.u64 %rd3, %rd2; // output
cvta.to.global.u64 %rd4, %rd1; // data
mov.u32 %r1, %ctaid.x; // block
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x; // thread
mad.lo.s32 %r4, %r1, %r2, %r3; // pos = block * 256 + thread
cvt.u64.u32 %rd5, %r4;
add.s64 %rd6, %rd4, %rd5; //data[pos]的地址
ld.global.u8 %rs1, [%rd6]; // 读取data[pos]
cvt.u64.u32 %rd7, %r3; // thread
mov.u64 %rd8, cuda_sbox; // 加载 sbox
add.s64 %rd9, %rd8, %rd7; // sbox[thread]的地址
ld.const.u8 %r5, [%rd9]; // sbox[thread]
cvt.u64.u32 %rd10, %r1; // block
add.s64 %rd11, %rd8, %rd10; // sbox[block]的地址
ld.const.u8 %r6, [%rd11]; //sbox[block]
mad.lo.s32 %r7, %r2, %r5, %r6; // val = sbox[thread] * 256 + sbox[block]
cvt.u64.u32 %rd12, %r7;
add.s64 %rd13, %rd3, %rd12; // output[val]
st.global.u8 [%rd13], %rs1; // output[val] = data[pos]
ret;

python实现:

1
2
3
# Layer2: pure byte permutation. The byte at (block, thread) moves to the
# position addressed by the S-box images of the two indices (note the swap of
# roles: sbox[thread] selects the row, sbox[block] the column).
for block in range(256):
    for thread in range(256):
        output[sbox[thread] * 256 + sbox[block]] = data[block * 256 + thread]

Layer3

起始部分有一个异或:

1
2
3
4
5
6
cvt.u16.u32 %rs8, %r3; // 将线程索引转换为16位
cvt.u16.u32 %rs9, %r1; // 将块索引转换为16位
or.b16 %rs10, %rs9, %rs8; // %rs10 = %rs9 | %rs8
ld.global.u8 %rs11, [%rd3]; // 加载原数据
xor.b16 %rs12, %rs11, %rs10; // %rs12 = %rs11 ^ %rs10
st.global.u8 [%rd3], %rs12; // 写回原数据

其python实现:

1
2
3
4
# Layer3, first step: XOR every byte with (block | thread).
for block in range(256):
    for thread in range(256):
        val = block | thread
        data[256 * block + thread] ^= val

中间按线程索引&7=0(为8的倍数)进行TEA加密的部分:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
bar.sync 0;
and.b32 %r23, %r3, 7; // %r23 = 线程索引 & 7
setp.ne.s32 %p1, %r23, 0; // 如果不为0则跳转到$L_BB2_4
@%p1 bra $L__BB2_4;

ld.global.u32 %r47, [%rd3+4]; // m2
ld.global.u32 %r48, [%rd3]; // m1
mov.u32 %r46, 1786956040; // delta
mov.u32 %r45, 0; //循环轮数

$L__BB2_2:; //TEA
.pragma "nounroll";
shl.b32 %r26, %r48, 4;
add.s32 %r27, %r26, 1386807340;
shr.u32 %r28, %r48, 5;
add.s32 %r29, %r28, 2007053320;
xor.b32 %r30, %r29, %r27;
add.s32 %r31, %r48, %r46;
xor.b32 %r32, %r30, %r31;
add.s32 %r47, %r32, %r47;
shl.b32 %r33, %r47, 4;
add.s32 %r34, %r33, 621668851;
add.s32 %r35, %r46, %r47;
xor.b32 %r36, %r34, %r35;
shr.u32 %r37, %r47, 5;
add.s32 %r38, %r37, -862448841;
xor.b32 %r39, %r36, %r38;
sub.s32 %r48, %r48, %r39;
add.s32 %r46, %r46, -1708609273;
add.s32 %r45, %r45, 1;
setp.ne.s32 %p2, %r45, 3238567;
@%p2 bra $L__BB2_2;

st.global.u32 [%rd3], %r48; //写回原数据
st.global.u32 [%rd3+4], %r47; //写回原数据

其python模拟如下:

1
2
3
4
5
6
# TEA-like rounds on one 8-byte group (m1 = low dword, m2 = high dword).
# Unlike textbook TEA, the second half-round subtracts, and the running sum
# starts at a non-zero value.
delta = 0x6a82c908

for i in range(0x316aa7):
    m2 = (m2 + (((m1 >> 5) + 0x77a13408) ^ ((m1 << 4) + 0x52a9002c) ^ (m1 + delta))) & 0xFFFFFFFF
    m1 = (m1 - (((m2 >> 5) + 0xcc981337) ^ ((m2 << 4) + 0x250de9f3) ^ (m2 + delta))) & 0xFFFFFFFF
    delta = (delta + 0x9a28b107) & 0xFFFFFFFF

之后在$L_BB2_4有一个异或:

1
2
3
4
5
bar.sync 0;
and.b16 %rs16, %rs9, %rs8;
ld.global.u8 %rs17, [%rd3];
xor.b16 %rs18, %rs17, %rs16;
st.global.u8 [%rd3], %rs18;

对应:

1
2
3
4
# Layer3, step after the TEA rounds: XOR every byte with (block & thread).
for block in range(256):
    for thread in range(256):
        val = block & thread
        data[256 * block + thread] ^= val

在$L_BB2_4的下半部分、$L_BB2_5和$L_BB2_7有Sbox、Tbox置换:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
bar.sync 0;
cvt.u64.u32 %rd7, %r3; //thread
mov.u64 %rd8, cuda_sbox;
add.s64 %rd9, %rd8, %rd7;
ld.const.u8 %rs31, [%rd9]; //加载sbox[thread]
cvta.to.global.u64 %rd4, %rd5;
mov.u16 %rs32, 0;
mov.u32 %r50, 0;
mov.u64 %rd14, cuda_tbox; //加载tbox

$L__BB2_5:
.pragma "nounroll";
cvt.u64.u32 %rd10, %r49;
add.s64 %rd11, %rd2, %rd10;
cvt.u64.u16 %rd12, %rs31; //sbox[thread]
and.b64 %rd13, %rd12, 255; //&0xFF
add.s64 %rd15, %rd14, %rd13; //tbox[sbox[thread]]的地址
ld.const.u8 %rs19, [%rd15]; // rs19为tbox[sbox[thread]]的值
ld.global.u8 %rs20, [%rd11]; //data
mul.lo.s16 %rs21, %rs19, %rs20; // tbox[sbox[thread]] * data
add.s16 %rs32, %rs21, %rs32; // 累加和
mul.lo.s16 %rs22, %rs31, 5; // sbox[thread] * 5
add.s16 %rs31, %rs22, 17; // sbox[thread] = sbox[thread] * 5 + 17
add.s32 %r49, %r49, 1; //计数+1
add.s32 %r50, %r50, 1;
setp.ne.s32 %p3, %r50, 256; //循环256次
@%p3 bra $L__BB2_5;

xor.b32 %r18, %r1, %r3;
mov.u32 %r51, 8;

$L__BB2_7:
.pragma "nounroll";
shl.b16 %rs23, %rs32, 3;
and.b16 %rs24, %rs32, 224;
shr.u16 %rs25, %rs24, 5;
or.b16 %rs26, %rs25, %rs23;
cvt.u32.u16 %r42, %rs26;
mad.lo.s32 %r43, %r42, 13, %r18;
and.b32 %r44, %r51, 255;
cvt.u64.u32 %rd16, %r44;
add.s64 %rd18, %rd14, %rd16; // tbox地址
cvt.u16.u32 %rs27, %r43;
ld.const.u8 %rs28, [%rd18];
xor.b16 %rs29, %rs28, %rs27;
cvt.u64.u16 %rd19, %rs29;
and.b64 %rd20, %rd19, 255;
add.s64 %rd22, %rd8, %rd20; // sbox地址
ld.const.u8 %rs32, [%rd22];
add.s32 %r51, %r51, 1;
setp.ne.s32 %p4, %r51, 4137823;
@%p4 bra $L__BB2_7;

add.s64 %rd23, %rd4, %rd1;
st.global.u8 [%rd23], %rs32;
ret;

对应:

1
2
3
4
5
6
7
8
9
# Layer3, last step: per-thread weighted sum over the whole row, followed by a
# long chain of T-box/S-box substitutions.
for block in range(256):
    for thread in range(256):
        ttl = 0
        # The PTX keeps this index in a register; the S-box table itself is
        # never modified.
        idx = sbox[thread]
        for rounds in range(256):
            ttl += tbox[idx] * b_dat[256 * block + rounds]
            idx = (idx * 5 + 17) & 0xFF
        for cycle in range(8, 4137823):
            # Each pass feeds the previous S-box output back in (the PTX
            # overwrites the accumulator register on every iteration).
            ttl = sbox[(tbox[cycle & 0xFF] ^ ((((ttl & 224) >> 5) | (ttl << 3)) * 13 + (block ^ thread))) & 0xFF]
        final[block * 256 + thread] = ttl

Layer3总体逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def bytes_to_dwords_little_endian(byte_array):
    """Pack a sequence of byte values into little-endian unsigned 32-bit words.

    Bytes are consumed four at a time; a shorter trailing group is converted
    as it stands (equivalent to padding it with zero high bytes).
    """
    return [
        int.from_bytes(byte_array[i:i + 4], byteorder='little', signed=False)
        for i in range(0, len(byte_array), 4)
    ]

def dwords_to_bytes_little_endian(dword_array):
    """Unpack unsigned 32-bit words into a flat list of byte values.

    Each word contributes four bytes, least significant byte first.
    """
    return [
        byte
        for dword in dword_array
        for byte in dword.to_bytes(4, byteorder='little')
    ]

sbox_ori = [0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48]

tbox = [0x62, 0x7C, 0x76, 0x7A, 0xF2, 0x6A, 0x6E, 0xC4, 0x30, 0x00, 0x66, 0x2A, 0xFE, 0xD6, 0xAA, 0x76,
0xCA, 0x82, 0xC8, 0x7C, 0xFA, 0x58, 0x46, 0xF0, 0xAC, 0xD4, 0xA2, 0xAE, 0x9C, 0xA4, 0x72, 0xC0,
0xB6, 0xFC, 0x92, 0x26, 0x36, 0x3E, 0xF6, 0xCC, 0x34, 0xA4, 0xE4, 0xF0, 0x70, 0xD8, 0x30, 0x14,
0x04, 0xC6, 0x22, 0xC2, 0x18, 0x96, 0x04, 0x9A, 0x06, 0x12, 0x80, 0xE2, 0xEA, 0x26, 0xB2, 0x74,
0x08, 0x82, 0x2C, 0x1A, 0x1A, 0x6E, 0x5A, 0xA0, 0x52, 0x3A, 0xD6, 0xB2, 0x28, 0xE2, 0x2E, 0x84,
0x52, 0xD0, 0x00, 0xEC, 0x20, 0xFC, 0xB0, 0x5A, 0x6A, 0xCA, 0xBE, 0x38, 0x4A, 0x4C, 0x58, 0xCE,
0xD0, 0xEE, 0xAA, 0xFA, 0x42, 0x4C, 0x32, 0x84, 0x44, 0xF8, 0x02, 0x7E, 0x50, 0x3C, 0x9E, 0xA8,
0x50, 0xA2, 0x40, 0x8E, 0x92, 0x9C, 0x38, 0xF4, 0xBC, 0xB6, 0xDA, 0x20, 0x10, 0xFE, 0xF2, 0xD2,
0xCC, 0x0C, 0x12, 0xEC, 0x5E, 0x96, 0x44, 0x16, 0xC4, 0xA6, 0x7E, 0x3C, 0x64, 0x5C, 0x18, 0x72,
0x60, 0x80, 0x4E, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEF, 0xB8, 0x14, 0xDE, 0x5E, 0x0A, 0xDA,
0xE0, 0x32, 0x3A, 0x0A, 0x48, 0x06, 0x24, 0x5C, 0xC2, 0xD2, 0xAC, 0x62, 0x90, 0x94, 0xE4, 0x78,
0xE6, 0xC8, 0x36, 0x6C, 0x8C, 0xD4, 0x4E, 0xA8, 0x6C, 0x56, 0xF4, 0xEA, 0x64, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x24, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDC, 0x74, 0x1E, 0x4A, 0xBC, 0x8A, 0x8A,
0x70, 0x3E, 0xB4, 0x66, 0x48, 0x02, 0xF6, 0x0E, 0x60, 0x34, 0x56, 0xB8, 0x86, 0xC0, 0x1C, 0x9E,
0xE0, 0xF8, 0x98, 0x10, 0x68, 0xD8, 0x8E, 0x94, 0x9A, 0x1E, 0x86, 0xE8, 0xCE, 0x54, 0x28, 0xDE,
0x8C, 0xA0, 0x88, 0x0C, 0xBE, 0xE6, 0x42, 0x68, 0x40, 0x98, 0x2C, 0x0E, 0xB0, 0x54, 0xBA, 0x16]

# Reference model of the whole Layer3 kernel. `data` holds the 256x256 input
# bytes (all zero here as a placeholder); `final` receives the output.
# NOTE: this mirrors the kernel literally and is far too slow to run to
# completion in pure Python; it documents the algorithm.
data = [0] * 65536
final = [0] * 65536

# Step 1: XOR every byte with (block | thread).
for block in range(256):
    for thread in range(256):
        data[256 * block + thread] ^= block | thread

# Step 2: TEA-like rounds on every 8-byte group (threads with thread & 7 == 0).
d_dat = bytes_to_dwords_little_endian(data)

for i in range(0, len(d_dat) - 1, 2):
    m1 = d_dat[i]
    m2 = d_dat[i + 1]
    delta = 0x6a82c908
    for _ in range(3238567):
        m2 = (m2 + (((m1 >> 5) + 0x77a13408) ^ ((m1 << 4) + 0x52a9002c) ^ (m1 + delta))) & 0xFFFFFFFF
        m1 = (m1 - (((m2 >> 5) + 0xcc981337) ^ ((m2 << 4) + 0x250de9f3) ^ (m2 + delta))) & 0xFFFFFFFF
        delta = (delta + 0x9a28b107) & 0xFFFFFFFF
    d_dat[i] = m1
    d_dat[i + 1] = m2

b_dat = dwords_to_bytes_little_endian(d_dat)

# Step 3: XOR every byte with (block & thread).
for block in range(256):
    for thread in range(256):
        b_dat[256 * block + thread] ^= block & thread

# Step 4: weighted sum of the whole row, then the substitution chain.
for block in range(256):
    row = b_dat[256 * block:256 * block + 256]
    for thread in range(256):
        # The kernel keeps this index in a register and never writes to the
        # S-box, so no per-thread copy of the table is needed.
        idx = sbox_ori[thread]
        ttl = 0
        for value in row:
            ttl += tbox[idx] * value
            idx = (idx * 5 + 17) & 0xFF
        for cycle in range(8, 4137823):
            ttl = sbox_ori[(tbox[cycle & 0xFF] ^ ((((ttl & 224) >> 5) | (ttl << 3)) * 13 + (block ^ thread))) & 0xFF]
        final[block * 256 + thread] = ttl

Layer3求解:

经过测试发现,ttl生成过程中对sbox[thread]的修改(x → (5x + 17) mod 256)对于thread 0~255在256轮之后都会回到初始值,也就是说整张表仍然等于原始sbox(SM4的sbox)。原因是这个线性同余递推满足满周期条件(增量17是奇数,乘数5满足5 ≡ 1 (mod 4)),周期恰好为256:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
sbox_ori = [0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
            0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
            0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
            0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
            0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
            0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
            0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
            0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
            0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
            0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
            0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
            0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
            0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
            0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
            0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
            0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48]

# For every thread index, apply the per-round update x -> (5x + 17) mod 256 to
# that thread's entry 256 times and keep the resulting table.
sboxs = []
for thread in range(256):
    sbox = sbox_ori.copy()
    for rounds in range(256):
        sbox[thread] = (sbox[thread] * 5 + 17) & 0xFF
    sboxs.append(sbox)

# Every table must be back to the original: the update has period 256.
for i in range(256):
    assert sbox_ori == sboxs[i]  # passes

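cycle过程中的 ((ttl & 224) >> 5) | (ttl << 3) 实际上就是8位循环左移3位:右移5位之后本来就只剩高3位,所以只要ttl已经是一个字节(第一轮之后ttl都是sbox的输出),&224就是多余的;只有第一轮的ttl是16位累加和,这时&224用来丢掉第8位以上的部分。逆向最后的cycle循环得到解密函数: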

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from numba import njit
import numpy as np

sbox = np.array([0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48], dtype=np.uint8)

tbox = np.array([0x62, 0x7C, 0x76, 0x7A, 0xF2, 0x6A, 0x6E, 0xC4, 0x30, 0x00, 0x66, 0x2A, 0xFE, 0xD6, 0xAA, 0x76,
0xCA, 0x82, 0xC8, 0x7C, 0xFA, 0x58, 0x46, 0xF0, 0xAC, 0xD4, 0xA2, 0xAE, 0x9C, 0xA4, 0x72, 0xC0,
0xB6, 0xFC, 0x92, 0x26, 0x36, 0x3E, 0xF6, 0xCC, 0x34, 0xA4, 0xE4, 0xF0, 0x70, 0xD8, 0x30, 0x14,
0x04, 0xC6, 0x22, 0xC2, 0x18, 0x96, 0x04, 0x9A, 0x06, 0x12, 0x80, 0xE2, 0xEA, 0x26, 0xB2, 0x74,
0x08, 0x82, 0x2C, 0x1A, 0x1A, 0x6E, 0x5A, 0xA0, 0x52, 0x3A, 0xD6, 0xB2, 0x28, 0xE2, 0x2E, 0x84,
0x52, 0xD0, 0x00, 0xEC, 0x20, 0xFC, 0xB0, 0x5A, 0x6A, 0xCA, 0xBE, 0x38, 0x4A, 0x4C, 0x58, 0xCE,
0xD0, 0xEE, 0xAA, 0xFA, 0x42, 0x4C, 0x32, 0x84, 0x44, 0xF8, 0x02, 0x7E, 0x50, 0x3C, 0x9E, 0xA8,
0x50, 0xA2, 0x40, 0x8E, 0x92, 0x9C, 0x38, 0xF4, 0xBC, 0xB6, 0xDA, 0x20, 0x10, 0xFE, 0xF2, 0xD2,
0xCC, 0x0C, 0x12, 0xEC, 0x5E, 0x96, 0x44, 0x16, 0xC4, 0xA6, 0x7E, 0x3C, 0x64, 0x5C, 0x18, 0x72,
0x60, 0x80, 0x4E, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEF, 0xB8, 0x14, 0xDE, 0x5E, 0x0A, 0xDA,
0xE0, 0x32, 0x3A, 0x0A, 0x48, 0x06, 0x24, 0x5C, 0xC2, 0xD2, 0xAC, 0x62, 0x90, 0x94, 0xE4, 0x78,
0xE6, 0xC8, 0x36, 0x6C, 0x8C, 0xD4, 0x4E, 0xA8, 0x6C, 0x56, 0xF4, 0xEA, 0x64, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x24, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDC, 0x74, 0x1E, 0x4A, 0xBC, 0x8A, 0x8A,
0x70, 0x3E, 0xB4, 0x66, 0x48, 0x02, 0xF6, 0x0E, 0x60, 0x34, 0x56, 0xB8, 0x86, 0xC0, 0x1C, 0x9E,
0xE0, 0xF8, 0x98, 0x10, 0x68, 0xD8, 0x8E, 0x94, 0x9A, 0x1E, 0x86, 0xE8, 0xCE, 0x54, 0x28, 0xDE,
0x8C, 0xA0, 0x88, 0x0C, 0xBE, 0xE6, 0x42, 0x68, 0x40, 0x98, 0x2C, 0x0E, 0xB0, 0x54, 0xBA, 0x16], dtype=np.uint8)

invs = np.array([0x71, 0x6c, 0x7a, 0xb8, 0x16, 0x0f, 0x1e, 0x41, 0x35, 0xeb, 0xd0, 0x2a, 0xe4, 0xac, 0x62, 0x5a,
0xcd, 0xc8, 0xdb, 0x1a, 0x0a, 0x8e, 0x08, 0x46, 0xf0, 0x4b, 0x96, 0xc1, 0x32, 0xa0, 0x60, 0xcc,
0xf7, 0x6d, 0x69, 0xaa, 0x61, 0x68, 0x1b, 0x76, 0x0c, 0xa9, 0x14, 0x10, 0x0e, 0xd8, 0xa3, 0xb7,
0x9b, 0xd2, 0x9a, 0x28, 0x95, 0x5f, 0x79, 0xb2, 0x86, 0xfe, 0xf4, 0x6b, 0x4a, 0x06, 0xfb, 0x3e,
0x84, 0xcb, 0x21, 0x2b, 0x19, 0xb3, 0x72, 0x40, 0xff, 0x1c, 0xe3, 0x5b, 0x78, 0xf6, 0xae, 0x4e,
0x22, 0xbf, 0x77, 0xad, 0x29, 0x97, 0x5d, 0x73, 0x65, 0x49, 0xce, 0xbe, 0xca, 0x92, 0x63, 0xfa,
0xa7, 0x8d, 0x2f, 0x64, 0x55, 0xe8, 0xa5, 0x11, 0x50, 0xe1, 0xba, 0x51, 0xbd, 0xbc, 0xed, 0xaf,
0x5c, 0x54, 0xbb, 0x45, 0xd9, 0x3c, 0x13, 0xe6, 0x6e, 0xf8, 0x27, 0xd6, 0x6a, 0xf2, 0xe7, 0xc7,
0x38, 0x52, 0xa4, 0x48, 0xef, 0x4d, 0x1d, 0x6f, 0xd3, 0xe0, 0x82, 0x57, 0x9d, 0xc0, 0xb6, 0x3d,
0x01, 0x24, 0xc3, 0x99, 0x3a, 0x37, 0xe5, 0xe2, 0x26, 0x1f, 0x12, 0x94, 0x20, 0x5e, 0x7f, 0x74,
0x7c, 0x8f, 0x67, 0x88, 0x93, 0xd4, 0x3f, 0x42, 0x4f, 0x33, 0x18, 0xab, 0x2e, 0x98, 0x91, 0xc2,
0xdf, 0x9e, 0x53, 0x31, 0xde, 0x87, 0x09, 0x07, 0xdc, 0xe9, 0x47, 0xc4, 0xc6, 0xd7, 0x15, 0x81,
0xa8, 0xd1, 0x0b, 0x17, 0x7d, 0xec, 0xee, 0x85, 0x7e, 0x34, 0xa6, 0xfd, 0x04, 0xd5, 0x8b, 0x2d,
0xda, 0x66, 0x83, 0x75, 0x70, 0xb0, 0x00, 0xfc, 0xcf, 0xc9, 0x56, 0xb1, 0xf5, 0xc5, 0xb4, 0x39,
0x90, 0x05, 0xa2, 0x9f, 0x30, 0xdd, 0x4c, 0x7b, 0x36, 0x02, 0x80, 0x59, 0xf3, 0x2c, 0xf9, 0x25,
0xf1, 0xea, 0x8a, 0x44, 0x23, 0x9c, 0xa1, 0x89, 0x58, 0x8c, 0x3b, 0x0d, 0x43, 0xb5, 0x03, 0xb9], dtype=np.uint8)

# Load the hex-encoded ciphertext and view it as a uint8 array. The array is
# backed by a bytearray, which is what allows dec_cycle to overwrite it in place.
with open("/(2025.04.26-2025.04.27) ACTF 2025/deeptx/3逆向cycle/encflag.txt", "r") as handle:
    encflag = handle.read().strip()
cipher_buffer = bytearray.fromhex(encflag)
final = np.frombuffer(cipher_buffer, dtype=np.uint8)

@njit
def dec_cycle(final, invs, tbox):
    """Undo the final "cycle" stage byte by byte, in place.

    Args:
        final: flat byte array with at least 256 * 256 entries, indexed as
            ``256 * block + thread``; it is overwritten with the result.
        invs: 256-entry lookup table applied to the current byte value.
        tbox: 256-entry table indexed by the low 8 bits of the cycle counter.

    Returns:
        The same ``final`` array that was passed in.
    """
    for block in range(256):
        for thread in range(256):
            idx = 256 * block + thread
            key = block ^ thread
            value = final[idx]
            # Walk the cycle counter backwards from 4137822 down to 8.
            for cycle in range(4137822, 7, -1):
                # Table lookup, XOR with the per-cycle byte, subtract the
                # position key, then multiply by 197 (13 * 197 = 2561 = 1 mod
                # 256, so this presumably undoes a multiply-by-13).
                t = (((invs[value] ^ tbox[cycle & 0xFF]) - key) * 197) & 0xFF
                # Rotate the byte left by 5 (equivalently right by 3).
                value = ((t << 5) | (t >> 3)) & 0xFF
            final[idx] = value
            # Progress output: index of the byte that was just finished.
            print(idx)
    return final

# Reverse the cycle stage and print the recovered bytes as one hex string.
final = dec_cycle(final, invs, tbox)
decycled_hex = final.tobytes().hex()
print(decycled_hex)

恢复出ttl,接下来的部分要解矩阵,注意到在一个block内,ttl是256字节的data和系数的乘积的加和,data可以看做一个256维向量,256线程的256轮的系数可以看做一个256阶的系数矩阵,矩阵乘以向量得到输出ttl,使用z3求解data:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import numpy as np
from z3 import *
from tqdm import tqdm

# Load the hex dump written by the previous step and arrange it as a
# 256 x 256 byte matrix; row `block_idx` is used below as that block's ttl.
with open("/(2025.04.26-2025.04.27) ACTF 2025/deeptx/3逆向cycle/encflag_decycle.txt", "r") as handle:
    encfs = handle.read().strip()
raw_bytes = bytearray.fromhex(encfs)
encf = np.array(raw_bytes, dtype=np.uint8)
data = encf.reshape(256, 256)

sbox_ori = [0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48]

tbox = [0x62, 0x7C, 0x76, 0x7A, 0xF2, 0x6A, 0x6E, 0xC4, 0x30, 0x00, 0x66, 0x2A, 0xFE, 0xD6, 0xAA, 0x76,
0xCA, 0x82, 0xC8, 0x7C, 0xFA, 0x58, 0x46, 0xF0, 0xAC, 0xD4, 0xA2, 0xAE, 0x9C, 0xA4, 0x72, 0xC0,
0xB6, 0xFC, 0x92, 0x26, 0x36, 0x3E, 0xF6, 0xCC, 0x34, 0xA4, 0xE4, 0xF0, 0x70, 0xD8, 0x30, 0x14,
0x04, 0xC6, 0x22, 0xC2, 0x18, 0x96, 0x04, 0x9A, 0x06, 0x12, 0x80, 0xE2, 0xEA, 0x26, 0xB2, 0x74,
0x08, 0x82, 0x2C, 0x1A, 0x1A, 0x6E, 0x5A, 0xA0, 0x52, 0x3A, 0xD6, 0xB2, 0x28, 0xE2, 0x2E, 0x84,
0x52, 0xD0, 0x00, 0xEC, 0x20, 0xFC, 0xB0, 0x5A, 0x6A, 0xCA, 0xBE, 0x38, 0x4A, 0x4C, 0x58, 0xCE,
0xD0, 0xEE, 0xAA, 0xFA, 0x42, 0x4C, 0x32, 0x84, 0x44, 0xF8, 0x02, 0x7E, 0x50, 0x3C, 0x9E, 0xA8,
0x50, 0xA2, 0x40, 0x8E, 0x92, 0x9C, 0x38, 0xF4, 0xBC, 0xB6, 0xDA, 0x20, 0x10, 0xFE, 0xF2, 0xD2,
0xCC, 0x0C, 0x12, 0xEC, 0x5E, 0x96, 0x44, 0x16, 0xC4, 0xA6, 0x7E, 0x3C, 0x64, 0x5C, 0x18, 0x72,
0x60, 0x80, 0x4E, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEF, 0xB8, 0x14, 0xDE, 0x5E, 0x0A, 0xDA,
0xE0, 0x32, 0x3A, 0x0A, 0x48, 0x06, 0x24, 0x5C, 0xC2, 0xD2, 0xAC, 0x62, 0x90, 0x94, 0xE4, 0x78,
0xE6, 0xC8, 0x36, 0x6C, 0x8C, 0xD4, 0x4E, 0xA8, 0x6C, 0x56, 0xF4, 0xEA, 0x64, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x24, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDC, 0x74, 0x1E, 0x4A, 0xBC, 0x8A, 0x8A,
0x70, 0x3E, 0xB4, 0x66, 0x48, 0x02, 0xF6, 0x0E, 0x60, 0x34, 0x56, 0xB8, 0x86, 0xC0, 0x1C, 0x9E,
0xE0, 0xF8, 0x98, 0x10, 0x68, 0xD8, 0x8E, 0x94, 0x9A, 0x1E, 0x86, 0xE8, 0xCE, 0x54, 0x28, 0xDE,
0x8C, 0xA0, 0x88, 0x0C, 0xBE, 0xE6, 0x42, 0x68, 0x40, 0x98, 0x2C, 0x0E, 0xB0, 0x54, 0xBA, 0x16]

# Coefficient matrix: C[thread][rounds] is the tbox entry selected by the
# thread's S-box value after `rounds` applications of x -> (5 * x + 17) mod 256.
# Only the thread's own entry evolves, so it is tracked as a single integer
# instead of copying the whole table for every thread.
C = []
for thread in range(256):
    state = sbox_ori[thread]
    row = []
    for rounds in range(256):
        row.append(tbox[state])
        state = (state * 5 + 17) & 0xFF
    C.append(row)

# Solve C * b_dat == ttl (mod 256) independently for each of the 256 blocks.
all_b_dat = []
for block_idx in tqdm(range(256), desc="Processing Blocks"):
    # Plain Python ints, so z3 never has to coerce NumPy scalars.
    ttl = [int(value) for value in data[block_idx]]
    solver = Solver()
    b_dat = [BitVec(f'b_dat_{i}', 8) for i in range(256)]
    for thread in range(256):
        linear_combination = sum(C[thread][rounds] * b_dat[rounds] for rounds in range(256))
        # 8-bit bit-vector arithmetic already wraps modulo 256; an explicit
        # `% 256` is not needed (256 is not even representable in 8 bits).
        solver.add(linear_combination == ttl[thread])
    if solver.check() == sat:
        model = solver.model()
        # model_completion=True guarantees a concrete value even for a
        # variable the solver left unassigned.
        b_dat_solution = [
            model.eval(b_dat[i], model_completion=True).as_long()
            for i in range(256)
        ]
        all_b_dat.extend(b_dat_solution)
        # Re-check the solution with ordinary integer arithmetic.
        is_correct = all(
            (sum(C[thread][rounds] * b_dat_solution[rounds] for rounds in range(256)) & 0xFF) == ttl[thread]
            for thread in range(256)
        )
        print(bytearray(b_dat_solution).hex())
        print(f"Block {block_idx}: {'Correct' if is_correct else 'Incorrect'}")
    else:
        # Keep later blocks at their expected offsets (256 * block_idx) in the
        # saved array by writing a zero placeholder for the unsolved block.
        all_b_dat.extend([0] * 256)
        print(f"Block {block_idx}: No solution found")

all_b_dat = np.array(all_b_dat, dtype=np.uint8)
np.save("recovered_b_dat.npy", all_b_dat)
print("All b_dat recovered and saved to 'recovered_b_dat.npy'")

接下来对layer3剩余的两个异或和TEA加密进行逆向:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from numba import njit
import numpy as np

def bytes_to_dwords_little_endian(byte_array):
    """Pack a sequence of byte values into little-endian unsigned integers.

    Every group of four consecutive items becomes one integer; a shorter
    trailing group is packed from the bytes that are available.

    Args:
        byte_array: sliceable sequence of integers in range(256), such as
            ``bytes``, a list of ints, or a NumPy integer array of any dtype.

    Returns:
        A list of Python ints, one per group.

    Raises:
        ValueError: if an item is outside range(256).
    """
    # Convert item by item rather than through the buffer protocol, so the
    # result depends on the values and not on the element width of the input.
    return [
        int.from_bytes(
            bytes(int(item) for item in byte_array[start:start + 4]),
            byteorder='little',
            signed=False,
        )
        for start in range(0, len(byte_array), 4)
    ]

def dwords_to_bytes_little_endian(dword_array):
    """Expand 32-bit unsigned integers into a flat list of byte values.

    Args:
        dword_array: iterable of integers in range(2 ** 32); Python ints and
            NumPy integer scalars are both accepted.

    Returns:
        A list of Python ints in range(256), four per input value, least
        significant byte first.

    Raises:
        OverflowError: if a value is negative or does not fit in four bytes.
    """
    # int(...) lets NumPy integers through; they have no `to_bytes` method.
    return [
        byte
        for dword in dword_array
        for byte in int(dword).to_bytes(4, byteorder='little')
    ]

bdat = np.load("/(2025.04.26-2025.04.27) ACTF 2025/deeptx/4逆向boxexchange/recovered_b_dat.npy")

# Strip the position mask: the byte at index 256 * block + thread is XORed
# with (block & thread).
for index in range(256 * 256):
    block, thread = divmod(index, 256)
    bdat[index] ^= block & thread

print(bdat)
# Regroup the bytes into little-endian 32-bit words for the block cipher step.
d_dat = bytes_to_dwords_little_endian(bdat)

@njit
def tea_dec(d_dat):
    """Run the TEA-style round function backwards over pairs of 32-bit words.

    Args:
        d_dat: mutable sequence of 32-bit unsigned values; elements 2 * i and
            2 * i + 1 form one pair. A trailing unpaired element is left
            untouched.

    Returns:
        The same ``d_dat`` object, with every pair replaced by its result.
    """
    delta = 0x9a28b107
    rounds = 3238567
    # Value of the running sum after the last round; it is the same for every
    # pair, so it is computed once instead of once per pair.
    final_sum = (0x6a82c908 + delta * rounds) & 0xFFFFFFFF
    for i in range(len(d_dat) // 2):
        print(i)  # progress: index of the pair being processed
        m1 = d_dat[2 * i]
        m2 = d_dat[2 * i + 1]
        ttl = final_sum
        for _ in range(rounds):
            # The sum is stepped back before it is used in the round.
            ttl = (ttl - delta) & 0xFFFFFFFF
            m1 = (m1 + (((m2 >> 5) + 0xcc981337) ^ ((m2 << 4) + 0x250de9f3) ^ (m2 + ttl))) & 0xFFFFFFFF
            m2 = (m2 - (((m1 >> 5) + 0x77a13408) ^ ((m1 << 4) + 0x52a9002c) ^ (m1 + ttl))) & 0xFFFFFFFF
        d_dat[2 * i] = m1
        d_dat[2 * i + 1] = m2
    return d_dat

d_dat = tea_dec(d_dat)
# Back from 32-bit words to a flat list of byte values.
b_dat = dwords_to_bytes_little_endian(d_dat)

# Strip the remaining position mask: the byte at index 256 * block + thread is
# XORed with (block | thread).
for index in range(256 * 256):
    block, thread = divmod(index, 256)
    b_dat[index] ^= block | thread

np.save("enc_layer2.npy", b_dat)
print("All b_dat recovered and saved to 'enc_layer2.npy'")

得到layer2加密后的结果,还原layer2并验证:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np

sbox = np.array([0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48], dtype=np.uint8)

invs = np.array([0x71, 0x6c, 0x7a, 0xb8, 0x16, 0x0f, 0x1e, 0x41, 0x35, 0xeb, 0xd0, 0x2a, 0xe4, 0xac, 0x62, 0x5a,
0xcd, 0xc8, 0xdb, 0x1a, 0x0a, 0x8e, 0x08, 0x46, 0xf0, 0x4b, 0x96, 0xc1, 0x32, 0xa0, 0x60, 0xcc,
0xf7, 0x6d, 0x69, 0xaa, 0x61, 0x68, 0x1b, 0x76, 0x0c, 0xa9, 0x14, 0x10, 0x0e, 0xd8, 0xa3, 0xb7,
0x9b, 0xd2, 0x9a, 0x28, 0x95, 0x5f, 0x79, 0xb2, 0x86, 0xfe, 0xf4, 0x6b, 0x4a, 0x06, 0xfb, 0x3e,
0x84, 0xcb, 0x21, 0x2b, 0x19, 0xb3, 0x72, 0x40, 0xff, 0x1c, 0xe3, 0x5b, 0x78, 0xf6, 0xae, 0x4e,
0x22, 0xbf, 0x77, 0xad, 0x29, 0x97, 0x5d, 0x73, 0x65, 0x49, 0xce, 0xbe, 0xca, 0x92, 0x63, 0xfa,
0xa7, 0x8d, 0x2f, 0x64, 0x55, 0xe8, 0xa5, 0x11, 0x50, 0xe1, 0xba, 0x51, 0xbd, 0xbc, 0xed, 0xaf,
0x5c, 0x54, 0xbb, 0x45, 0xd9, 0x3c, 0x13, 0xe6, 0x6e, 0xf8, 0x27, 0xd6, 0x6a, 0xf2, 0xe7, 0xc7,
0x38, 0x52, 0xa4, 0x48, 0xef, 0x4d, 0x1d, 0x6f, 0xd3, 0xe0, 0x82, 0x57, 0x9d, 0xc0, 0xb6, 0x3d,
0x01, 0x24, 0xc3, 0x99, 0x3a, 0x37, 0xe5, 0xe2, 0x26, 0x1f, 0x12, 0x94, 0x20, 0x5e, 0x7f, 0x74,
0x7c, 0x8f, 0x67, 0x88, 0x93, 0xd4, 0x3f, 0x42, 0x4f, 0x33, 0x18, 0xab, 0x2e, 0x98, 0x91, 0xc2,
0xdf, 0x9e, 0x53, 0x31, 0xde, 0x87, 0x09, 0x07, 0xdc, 0xe9, 0x47, 0xc4, 0xc6, 0xd7, 0x15, 0x81,
0xa8, 0xd1, 0x0b, 0x17, 0x7d, 0xec, 0xee, 0x85, 0x7e, 0x34, 0xa6, 0xfd, 0x04, 0xd5, 0x8b, 0x2d,
0xda, 0x66, 0x83, 0x75, 0x70, 0xb0, 0x00, 0xfc, 0xcf, 0xc9, 0x56, 0xb1, 0xf5, 0xc5, 0xb4, 0x39,
0x90, 0x05, 0xa2, 0x9f, 0x30, 0xdd, 0x4c, 0x7b, 0x36, 0x02, 0x80, 0x59, 0xf3, 0x2c, 0xf9, 0x25,
0xf1, 0xea, 0x8a, 0x44, 0x23, 0x9c, 0xa1, 0x89, 0x58, 0x8c, 0x3b, 0x0d, 0x43, 0xb5, 0x03, 0xb9], dtype=np.uint8)

# Input from the previous step plus two zero-filled work buffers of 256 * 256
# default-integer entries: `dec` receives the un-permuted data and `enc` the
# re-permuted copy used as a round-trip check.
data = np.load("/(2025.04.26-2025.04.27) ACTF 2025/deeptx/5逆向layer3剩余部分/enc_layer2.npy")
dec = np.zeros(256 * 256, dtype=int)
enc = np.zeros(256 * 256, dtype=int)
print(data)

# Undo the layer-2 position shuffle: the byte stored at (block, thread) goes
# to row invs[thread], column invs[block].
# The table entries are converted to Python ints first: `invs` holds uint8
# values, and `uint8 * 256` raises OverflowError on NumPy >= 2.
inverse = [int(value) for value in invs]
for block in range(256):
    for thread in range(256):
        dec[inverse[thread] * 256 + inverse[block]] = data[block * 256 + thread]

print(dec)
dec = np.array(dec, dtype=np.uint8)
np.save("enc1.npy", dec)

# Round-trip check: apply the shuffle again with the forward table, so the
# printed `enc` can be compared with the `data` printed earlier.
# The table entries are converted to Python ints first: `sbox` holds uint8
# values, and `uint8 * 256` raises OverflowError on NumPy >= 2.
forward = [int(value) for value in sbox]
for block in range(256):
    for thread in range(256):
        enc[forward[thread] * 256 + forward[block]] = dec[block * 256 + thread]

print(enc)

得到仅layer1加密的flag,查看图像:

发现只剩下因卷积产生的动态模糊效果了,说明layer2~3的逆向还原完全正确,layer1是一个动态模糊,由于模糊核大部分都是0,考虑用AI恢复字符,尝试多次得到一个比较清晰的图片:

编写脚本爆破flag:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import hashlib
from itertools import product

# SHA-256 (hex) that the correct flag must hash to.
target_hash = "fe6c63ce926ab3ed8aba9f52a3a3b8b5ef34d00c89708fba719c4cd13d0a9d73"

# One string per unknown flag position, in order; each string lists the
# characters considered possible at that position.
candidate_glyphs = (
    "D",
    "e",
    "E",
    "p",
    "acemnorstuvwxz",
    "C",
    "U",
    "d",
    "A",
    "RDUru0Oo",
    "Ii71lL",
    "YVvy",
    "0OoCGQa",
    "uU",
    "7TZ",
)
possible_chars = [list(glyphs) for glyphs in candidate_glyphs]

# Fixed text wrapped around the unknown characters.
prefix = "ACTF{"
suffix = "}"

def check_flags(possible_chars, prefix, suffix, target_hash):
    """Search the candidate flags for the one whose SHA-256 matches.

    Args:
        possible_chars: one iterable of candidate strings per unknown
            position; every combination is tried in ``itertools.product``
            order.
        prefix: text placed before the combined characters.
        suffix: text placed after the combined characters.
        target_hash: expected SHA-256 digest as a hex string; letter case and
            surrounding whitespace are ignored.

    Returns:
        The first matching flag, or ``None`` when no combination matches.
    """
    # hexdigest() is always lower-case, so normalise the target once up front;
    # otherwise an upper-case digest could never match.
    wanted = target_hash.strip().lower()
    for combination in product(*possible_chars):
        flag = prefix + "".join(combination) + suffix
        print(flag)
        hash_value = hashlib.sha256(flag.encode()).hexdigest()
        if hash_value == wanted:
            print(f"Found valid flag: {flag}")
            return flag
    print("No valid flag found.")
    return None

# Try every combination from the candidate table; the result is None when
# nothing matches.
valid_flag = check_flags(
    possible_chars=possible_chars,
    prefix=prefix,
    suffix=suffix,
    target_hash=target_hash,
)

爆破得到flag:ACTF{DeEptCUdAR1VQUZ}.

闲话

deeptx真难逆😭