/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcpred_mips.h"

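/* HEVC intra prediction angle parameters (intraPredAngle, H.265 Table 8-5):
 * intra_pred_angle_up covers the vertical-leaning modes 18..34,
 * intra_pred_angle_low the horizontal-leaning modes 2..17. */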
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};

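/* Computes two 16-pixel rows of planar prediction: each output is a weighted
 * sum of the row above (src0_r/src0_l), the left samples broadcast in
 * vec0/vec1, and the top-right/bottom-left corner samples (tmp0/tmp1),
 * rounded by 'round' bits and packed to bytes in res0/res1. */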
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                             \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                     \
                                                                              \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                   \
                                                                              \
    res0_m += mul_val_h1 * tmp0;                                              \
    res1_m += mul_val_h3 * tmp0;                                              \
    res2_m += mul_val_h1 * tmp0;                                              \
    res3_m += mul_val_h3 * tmp0;                                              \
                                                                              \
    res0_m += mul_val_b0 * src0_r;                                            \
    res1_m += mul_val_b0 * src0_l;                                            \
    res2_m += (mul_val_b0 - 1) * src0_r;                                      \
    res3_m += (mul_val_b0 - 1) * src0_l;                                      \
                                                                              \
    res0_m += mul_val_b1 * tmp1;                                              \
    res1_m += mul_val_b1 * tmp1;                                              \
    res2_m += (mul_val_b1 + 1) * tmp1;                                        \
    res3_m += (mul_val_b1 + 1) * tmp1;                                        \
                                                                              \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                       \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                  \
}

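/* Vertical prediction (mode 26): replicate the row above down the block.
 * For luma (flag == 0) the first column is additionally smoothed with the
 * left neighbours: dst[0][y] = top[0] + ((left[y] - left[-1]) >> 1),
 * clipped to [0, 255]. */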
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint32_t col;
    uint32_t src_data;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

    if (0 == flag) {
        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
        }
    }
}

static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}

static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}

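/* Horizontal prediction (mode 10): replicate left[y] across each row.
 * For luma (flag == 0) the first row is smoothed with the above samples:
 * dst[x][0] = left[0] + ((top[x] - top[-1]) >> 1), clipped to [0, 255]. */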
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint32_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

    if (0 == flag) {
        val0 = LW(src_top);
        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
        SW(val0, dst);
    }
}

static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint64_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

    if (0 == flag) {
        val0 = LD(src_top);
        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
        SD(val0, dst);
    }
}

static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}

static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);
        dst += stride;
        ST_SB2(src1, src1, dst, 16);
        dst += stride;
        ST_SB2(src2, src2, dst, 16);
        dst += stride;
        ST_SB2(src3, src3, dst, 16);
        dst += stride;
    }
}

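/* DC prediction: fill the block with the rounded average of the 2 * size
 * reference samples. For luma (flag == 0) the first row and column are
 * blended towards the neighbours, e.g. dst[0][0] = (left[0] + 2 * dc +
 * top[0] + 2) >> 2 and dst[x][0] = (top[x] + 3 * dc + 2) >> 2, as the spec
 * requires for sizes below 32x32. */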
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t addition = 0;
    uint32_t val0, val1, val2;
    v16i8 src = { 0 };
    v16u8 store;
    v16i8 zero = { 0 };
    v8u16 sum, vec0, vec1;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);
        SW(val0, tmp_dst);

        val0 = src_left[1];
        val1 = src_left[2];
        val2 = src_left[3];

        addition *= 3;

        ADD2(val0, addition, val1, addition, val0, val1);
        val2 += addition;

        val0 += 2;
        val1 += 2;
        val2 += 2;
        val0 >>= 2;
        val1 >>= 2;
        val2 >>= 2;

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
    }
}

static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}

static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}

static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}

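/* Planar prediction: each sample averages a horizontal and a vertical
 * linear interpolation,
 *   dst[x][y] = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
 *                (size - 1 - y) * top[x]  + (y + 1) * left[size] +
 *                size) >> (log2(size) + 1)
 * The multiplier vectors below hold the (size - 1 - x) and (x + 1)
 * per-column weights. */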
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}

static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}

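/* Planar prediction of a 32x32 block is split into four 16x16 quadrants:
 * 'offset' (0 or 16) selects the horizontal half and shifts the x weights
 * accordingly, while the upper/lower variants carry the y weights for
 * rows 0..15 and 16..31 respectively. */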
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    dst += (16 * stride);
    src_left += 16;

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}

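/* Angular prediction, vertical-leaning modes (18..34): each row samples the
 * reference above at the sub-pel position idx + fact/32 and blends the two
 * neighbouring taps,
 *   dst[x][y] = ((32 - fact) * ref[x + idx + 1] + fact * ref[x + idx + 2] +
 *                16) >> 5
 * For negative angles the left neighbours are first projected onto the
 * reference row using the inverse angle. */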
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    if (angle < 0 && last < -1) {
        /* Look the inverse angle up only here: the table covers modes
         * 18..25 (negative angles) and indexing it for larger modes
         * would read past its end. */
        inv_angle_val = inv_angle[mode - 18];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Only negative angles (modes 18..25) reach this branch, so the
         * inverse angle lookup is kept inside the guard. */
        inv_angle_val = inv_angle[mode - 18];
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Only negative angles (modes 18..25) reach this branch, so the
         * inverse angle lookup is kept inside the guard. */
        inv_angle_val = inv_angle[mode - 18];
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Only negative angles (modes 18..25) reach this branch, so the
         * inverse angle lookup is kept inside the guard. */
        inv_angle_val = inv_angle[mode - 18];
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}

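/* Angular prediction, horizontal-leaning modes (2..17): the same sub-pel
 * blend as the upper modes, but walking down the left reference column, so
 * the interpolated vectors are transposed before being stored. Here the
 * projection for negative angles maps top neighbours onto the left column. */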
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}

static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 1;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        top0 = LD_SB(ref);
        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst_org += (8 * stride);
        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);

        dst += 2;
    }
}

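/* Pure vertical copy used by the 32x32 mode 26 case; no edge filtering is
 * applied at this block size. */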
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

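/* Exported entry points: ff_hevc_intra_pred_planar_{0,1,2,3}_msa handle
 * block sizes 4, 8, 16 and 32 respectively. */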
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}

1811 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1812  const uint8_t *src_left,
1813  ptrdiff_t stride, int log2, int c_idx)
1814 {
1815  switch (log2) {
1816  case 2:
1817  hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1818  break;
1819 
1820  case 3:
1821  hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1822  break;
1823 
1824  case 4:
1825  hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1826  break;
1827 
1828  case 5:
1829  hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1830  break;
1831  }
1832 }
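
ff_hevc_intra_pred_dc_msa dispatches on log2 of the block size (2..5 for 4x4
through 32x32). The DC value itself is the rounded mean of the 2*size
neighbouring samples; in scalar form (sketch only):

    #include <stdint.h>

    static uint8_t dc_value(const uint8_t *top, const uint8_t *left,
                            int log2size)
    {
        int size = 1 << log2size;
        int sum = size;                  /* rounding term */
        int i;

        for (i = 0; i < size; i++)
            sum += top[i] + left[i];
        return (uint8_t) (sum >> (log2size + 1));
    }

The 4x4/8x8/16x16 kernels also receive c_idx because, for luma blocks smaller
than 32x32, HEVC additionally smooths the first row and column of the DC
block toward the neighbouring samples; the 32x32 kernel never applies that
filter, so it takes no c_idx.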
1833 
1834 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1835  const uint8_t *src_top,
1836  const uint8_t *src_left,
1837  ptrdiff_t stride, int c_idx, int mode)
1838 {
1839  if (mode == 10) {
1840  hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841  } else if (mode == 26) {
1842  hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1843  } else if (mode >= 18) {
1844  hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1845  dst, stride, mode);
1846  } else {
1847  hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1848  dst, stride, mode);
1849  }
1850 }
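
The dispatch above follows the HEVC angular mode numbering: mode 10 is pure
horizontal, mode 26 pure vertical, modes 18..34 predict mainly from the top
reference row ("upper") and modes 2..17 from the left column ("lower"). The
angle tables at the top of this file are indexed accordingly; a sketch of the
assumed mapping (for orientation only, the real kernels index the tables
internally):

    /* assumed indexing: modes 18..34 -> intra_pred_angle_up,
     * modes 2..17 -> intra_pred_angle_low */
    static int8_t mode_to_angle(int mode)
    {
        return mode >= 18 ? intra_pred_angle_up[mode - 18]
                          : intra_pred_angle_low[mode - 2];
    }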
1851 
1852 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1853  const uint8_t *src_top,
1854  const uint8_t *src_left,
1855  ptrdiff_t stride, int c_idx, int mode)
1856 {
1857  if (mode == 10) {
1858  hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859  } else if (mode == 26) {
1860  hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1861  } else if (mode >= 18) {
1862  hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1863  dst, stride, mode);
1864  } else {
1865  hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1866  dst, stride, mode);
1867  }
1868 }
1869 
1870 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1871  const uint8_t *src_top,
1872  const uint8_t *src_left,
1873  ptrdiff_t stride, int c_idx, int mode)
1874 {
1875  if (mode == 10) {
1876  hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877  } else if (mode == 26) {
1878  hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1879  } else if (mode >= 18) {
1880  hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1881  dst, stride, mode);
1882  } else {
1883  hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1884  dst, stride, mode);
1885  }
1886 }
1887 
1888 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1889  const uint8_t *src_top,
1890  const uint8_t *src_left,
1891  ptrdiff_t stride, int c_idx, int mode)
1892 {
1893  if (mode == 10) {
1894  hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1895  } else if (mode == 26) {
1896  intra_predict_vert_32x32_msa(src_top, dst, stride);
1897  } else if (mode >= 18) {
1898  hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1899  dst, stride, mode);
1900  } else {
1901  hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1902  dst, stride, mode);
1903  }
1904 }
1905 
1906 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1907 {
1908  v16u8 vec0;
1909  HEVCLocalContext *lc = s->HEVClc;
1910  int i;
1911  int hshift = s->ps.sps->hshift[c_idx];
1912  int vshift = s->ps.sps->vshift[c_idx];
1913  int size_in_luma_h = 16 << hshift;
1914  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1915  int size_in_luma_v = 16 << vshift;
1916  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1917  int x = x0 >> hshift;
1918  int y = y0 >> vshift;
1919  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1920  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1921 
1922  int cur_tb_addr =
1923  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1924 
1925  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1926  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1927 
1928  int min_pu_width = s->ps.sps->min_pu_width;
1929 
1930  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1931  lc->tu.intra_pred_mode;
1932  uint32_t a;
1933  uint8_t left_array[2 * 32 + 1];
1934  uint8_t filtered_left_array[2 * 32 + 1];
1935  uint8_t top_array[2 * 32 + 1];
1936  uint8_t filtered_top_array[2 * 32 + 1];
1937 
1938  uint8_t *left = left_array + 1;
1939  uint8_t *top = top_array + 1;
1940  uint8_t *filtered_left = filtered_left_array + 1;
1941  uint8_t *filtered_top = filtered_top_array + 1;
1942  int cand_bottom_left = lc->na.cand_bottom_left
1943  && cur_tb_addr >
1944  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1945  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1946  int cand_left = lc->na.cand_left;
1947  int cand_up_left = lc->na.cand_up_left;
1948  int cand_up = lc->na.cand_up;
1949  int cand_up_right = lc->na.cand_up_right
1950  && cur_tb_addr >
1951  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1952  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1953 
1954  int bottom_left_size =
1955  (((y0 + 2 * size_in_luma_v) >
1956  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1957  2 * size_in_luma_v)) -
1958  (y0 + size_in_luma_v)) >> vshift;
1959  int top_right_size =
1960  (((x0 + 2 * size_in_luma_h) >
1961  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1962  (x0 + size_in_luma_h)) >> hshift;
1963 
1964  if (s->ps.pps->constrained_intra_pred_flag == 1) {
1965  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1966  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1967  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1968  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1969  if (!size_in_luma_pu_h)
1970  size_in_luma_pu_h++;
1971  if (cand_bottom_left == 1 && on_pu_edge_x) {
1972  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1973  int y_bottom_pu =
1974  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1975  int max =
1976  ((size_in_luma_pu_v) >
1977  (s->ps.sps->min_pu_height -
1978  y_bottom_pu) ? (s->ps.sps->min_pu_height -
1979  y_bottom_pu) : (size_in_luma_pu_v));
1980  cand_bottom_left = 0;
1981  for (i = 0; i < max; i += 2)
1982  cand_bottom_left |=
1983  ((s->ref->tab_mvf[(x_left_pu) +
1984  (y_bottom_pu +
1985  i) * min_pu_width]).pred_flag ==
1986  PF_INTRA);
1987  }
1988  if (cand_left == 1 && on_pu_edge_x) {
1989  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1990  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1991  int max =
1992  ((size_in_luma_pu_v) >
1993  (s->ps.sps->min_pu_height -
1994  y_left_pu) ? (s->ps.sps->min_pu_height -
1995  y_left_pu) : (size_in_luma_pu_v));
1996  cand_left = 0;
1997  for (i = 0; i < max; i += 2)
1998  cand_left |=
1999  ((s->ref->tab_mvf[(x_left_pu) +
2000  (y_left_pu +
2001  i) * min_pu_width]).pred_flag ==
2002  PF_INTRA);
2003  }
2004  if (cand_up_left == 1) {
2005  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2006  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2007  cand_up_left =
2008  (s->ref->tab_mvf[(x_left_pu) +
2009  (y_top_pu) * min_pu_width]).pred_flag ==
2010  PF_INTRA;
2011  }
2012  if (cand_up == 1 && on_pu_edge_y) {
2013  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2014  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2015  int max =
2016  ((size_in_luma_pu_h) >
2017  (s->ps.sps->min_pu_width -
2018  x_top_pu) ? (s->ps.sps->min_pu_width -
2019  x_top_pu) : (size_in_luma_pu_h));
2020  cand_up = 0;
2021  for (i = 0; i < max; i += 2)
2022  cand_up |=
2023  ((s->ref->tab_mvf[(x_top_pu + i) +
2024  (y_top_pu) *
2025  min_pu_width]).pred_flag == PF_INTRA);
2026  }
2027  if (cand_up_right == 1 && on_pu_edge_y) {
2028  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2029  int x_right_pu =
2030  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2031  int max =
2032  ((size_in_luma_pu_h) >
2033  (s->ps.sps->min_pu_width -
2034  x_right_pu) ? (s->ps.sps->min_pu_width -
2035  x_right_pu) : (size_in_luma_pu_h));
2036  cand_up_right = 0;
2037  for (i = 0; i < max; i += 2)
2038  cand_up_right |=
2039  ((s->ref->tab_mvf[(x_right_pu + i) +
2040  (y_top_pu) *
2041  min_pu_width]).pred_flag == PF_INTRA);
2042  }
2043 
2044  vec0 = (v16u8) __msa_ldi_b(128);
2045 
2046  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2047 
2048  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2049 
2050  top[-1] = 128;
2051  }
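    /* With constrained intra prediction, both reference arrays were just
     * pre-filled with mid-grey (128) so that neighbours later found to be
     * non-intra still read as defined samples. */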
2052  if (cand_up_left) {
2053  left[-1] = src[(-1) + stride * (-1)];
2054  top[-1] = left[-1];
2055  }
2056  if (cand_up) {
2057  vec0 = LD_UB(src - stride);
2058  ST_UB(vec0, top);
2059  }
2060  if (cand_up_right) {
2061  vec0 = LD_UB(src - stride + 16);
2062  ST_UB(vec0, (top + 16));
2063 
2064  do {
2065  uint32_t pix =
2066  ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2067  0x01010101U);
2068  for (i = 0; i < (16 - top_right_size); i += 4)
2069  ((((union unaligned_32 *) (top + 16 + top_right_size +
2070  i))->l) = (pix));
2071  } while (0);
2072  }
2073  if (cand_left)
2074  for (i = 0; i < 16; i++)
2075  left[i] = src[(-1) + stride * (i)];
2076  if (cand_bottom_left) {
2077  for (i = 16; i < 16 + bottom_left_size; i++)
2078  left[i] = src[(-1) + stride * (i)];
2079  do {
2080  uint32_t pix =
2081  ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2082  0x01010101U);
2083  for (i = 0; i < (16 - bottom_left_size); i += 4)
2084  ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2085  i))->l) = (pix));
2086  } while (0);
2087  }
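    /* Available neighbours have now been copied from the frame; samples
     * beyond the picture edge were padded by replicating the last valid
     * byte four at a time (the 0x01010101 word splats). */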
2088 
2089  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2090  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2091  || cand_up_right) {
2092  int size_max_x =
2093  x0 + ((2 * 16) << hshift) <
2094  s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2095  int size_max_y =
2096  y0 + ((2 * 16) << vshift) <
2097  s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2098  int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2099  if (!cand_up_right) {
2100  size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2101  16 : (s->ps.sps->width - x0) >> hshift;
2102  }
2103  if (!cand_bottom_left) {
2104  size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2105  16 : (s->ps.sps->height - y0) >> vshift;
2106  }
2107  if (cand_bottom_left || cand_left || cand_up_left) {
2108  while (j > -1
2109  &&
2110  !((s->ref->tab_mvf[(((x0 +
2111  ((-1) << hshift)) >> s->ps.sps->
2112  log2_min_pu_size)) + (((y0 +
2113  ((j) <<
2114  vshift))
2115  >> s->ps.sps->
2116  log2_min_pu_size))
2117  * min_pu_width]).pred_flag ==
2118  PF_INTRA))
2119  j--;
2120  if (!
2121  ((s->ref->tab_mvf[(((x0 +
2122  ((-1) << hshift)) >> s->ps.sps->
2123  log2_min_pu_size)) + (((y0 + ((j)
2124  <<
2125  vshift))
2126  >> s->ps.sps->
2127  log2_min_pu_size))
2128  * min_pu_width]).pred_flag == PF_INTRA)) {
2129  j = 0;
2130  while (j < size_max_x
2131  &&
2132  !((s->ref->tab_mvf[(((x0 +
2133  ((j) << hshift)) >> s->ps.sps->
2134  log2_min_pu_size)) + (((y0 +
2135  ((-1) <<
2136  vshift))
2137  >> s->
2138  ps.sps->
2139  log2_min_pu_size))
2140  * min_pu_width]).pred_flag ==
2141  PF_INTRA))
2142  j++;
2143  for (i = j; i > (j) - (j + 1); i--)
2144  if (!
2145  ((s->ref->tab_mvf[(((x0 +
2146  ((i -
2147  1) << hshift)) >> s->ps.sps->
2148  log2_min_pu_size)) + (((y0 +
2149  ((-1) <<
2150  vshift))
2151  >> s->
2152  ps.sps->
2153  log2_min_pu_size))
2154  * min_pu_width]).pred_flag ==
2155  PF_INTRA))
2156  top[i - 1] = top[i];
2157  left[-1] = top[-1];
2158  }
2159  } else {
2160  j = 0;
2161  while (j < size_max_x
2162  &&
2163  !((s->ref->tab_mvf[(((x0 +
2164  ((j) << hshift)) >> s->ps.sps->
2165  log2_min_pu_size)) + (((y0 + ((-1)
2166  <<
2167  vshift))
2168  >> s->ps.sps->
2169  log2_min_pu_size))
2170  * min_pu_width]).pred_flag ==
2171  PF_INTRA))
2172  j++;
2173  if (j > 0)
2174  if (x0 > 0) {
2175  for (i = j; i > (j) - (j + 1); i--)
2176  if (!
2177  ((s->ref->tab_mvf[(((x0 +
2178  ((i -
2179  1) << hshift)) >>
2180  s->ps.sps->log2_min_pu_size))
2181  + (((y0 + ((-1)
2182  << vshift))
2183  >>
2184  s->ps.sps->log2_min_pu_size))
2185  *
2186  min_pu_width]).pred_flag ==
2187  PF_INTRA))
2188  top[i - 1] = top[i];
2189  } else {
2190  for (i = j; i > (j) - (j); i--)
2191  if (!
2192  ((s->ref->tab_mvf[(((x0 +
2193  ((i -
2194  1) << hshift)) >>
2195  s->ps.sps->log2_min_pu_size))
2196  + (((y0 + ((-1)
2197  << vshift))
2198  >>
2199  s->ps.sps->log2_min_pu_size))
2200  *
2201  min_pu_width]).pred_flag ==
2202  PF_INTRA))
2203  top[i - 1] = top[i];
2204  top[-1] = top[0];
2205  }
2206  left[-1] = top[-1];
2207  }
2208  left[-1] = top[-1];
2209  if (cand_bottom_left || cand_left) {
2210  a = ((left[-1]) * 0x01010101U);
2211  for (i = 0; i < (0) + (size_max_y); i += 4)
2212  if (!
2213  ((s->ref->tab_mvf[(((x0 +
2214  ((-1) << hshift)) >> s->ps.sps->
2215  log2_min_pu_size)) + (((y0 +
2216  ((i) <<
2217  vshift))
2218  >> s->ps.sps->
2219  log2_min_pu_size))
2220  * min_pu_width]).pred_flag ==
2221  PF_INTRA))
2222  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2223  else
2224  a = ((left[i + 3]) * 0x01010101U);
2225  }
2226  if (!cand_left) {
2227  vec0 = (v16u8) __msa_fill_b(left[-1]);
2228 
2229  ST_UB(vec0, left);
2230  }
2231  if (!cand_bottom_left) {
2232 
2233  vec0 = (v16u8) __msa_fill_b(left[15]);
2234 
2235  ST_UB(vec0, (left + 16));
2236  }
2237  if (x0 != 0 && y0 != 0) {
2238  a = ((left[size_max_y - 1]) * 0x01010101U);
2239  for (i = (size_max_y - 1);
2240  i > (size_max_y - 1) - (size_max_y); i -= 4)
2241  if (!
2242  ((s->ref->tab_mvf[(((x0 +
2243  ((-1) << hshift)) >> s->ps.sps->
2244  log2_min_pu_size)) + (((y0 +
2245  ((i -
2246  3) <<
2247  vshift))
2248  >> s->ps.sps->
2249  log2_min_pu_size))
2250  * min_pu_width]).pred_flag ==
2251  PF_INTRA))
2252  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2253  else
2254  a = ((left[i - 3]) * 0x01010101U);
2255  if (!
2256  ((s->ref->tab_mvf[(((x0 +
2257  ((-1) << hshift)) >> s->ps.sps->
2258  log2_min_pu_size)) + (((y0 + ((-1)
2259  <<
2260  vshift))
2261  >> s->ps.sps->
2262  log2_min_pu_size))
2263  * min_pu_width]).pred_flag == PF_INTRA))
2264  left[-1] = left[0];
2265  } else if (x0 == 0) {
2266  do {
2267  uint32_t pix = ((0) * 0x01010101U);
2268  for (i = 0; i < (size_max_y); i += 4)
2269  ((((union unaligned_32 *) (left + i))->l) = (pix));
2270  } while (0);
2271  } else {
2272  a = ((left[size_max_y - 1]) * 0x01010101U);
2273  for (i = (size_max_y - 1);
2274  i > (size_max_y - 1) - (size_max_y); i -= 4)
2275  if (!
2276  ((s->ref->tab_mvf[(((x0 +
2277  ((-1) << hshift)) >> s->ps.sps->
2278  log2_min_pu_size)) + (((y0 +
2279  ((i -
2280  3) <<
2281  vshift))
2282  >> s->ps.sps->
2283  log2_min_pu_size))
2284  * min_pu_width]).pred_flag ==
2285  PF_INTRA))
2286  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2287  else
2288  a = ((left[i - 3]) * 0x01010101U);
2289  }
2290  top[-1] = left[-1];
2291  if (y0 != 0) {
2292  a = ((left[-1]) * 0x01010101U);
2293  for (i = 0; i < (0) + (size_max_x); i += 4)
2294  if (!
2295  ((s->ref->tab_mvf[(((x0 +
2296  ((i) << hshift)) >> s->ps.sps->
2297  log2_min_pu_size)) + (((y0 + ((-1)
2298  <<
2299  vshift))
2300  >> s->ps.sps->
2301  log2_min_pu_size))
2302  * min_pu_width]).pred_flag ==
2303  PF_INTRA))
2304  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2305  else
2306  a = ((top[i + 3]) * 0x01010101U);
2307  }
2308  }
2309  }
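    /* End of the constrained-intra fixup: border samples that came from
     * non-intra neighbours have been overwritten by propagating the
     * nearest intra-coded sample along the reference border. */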
2310 
2311  if (!cand_bottom_left) {
2312  if (cand_left) {
2313  vec0 = (v16u8) __msa_fill_b(left[15]);
2314 
2315  ST_UB(vec0, (left + 16));
2316 
2317  } else if (cand_up_left) {
2318  vec0 = (v16u8) __msa_fill_b(left[-1]);
2319 
2320  ST_UB2(vec0, vec0, left, 16);
2321 
2322  cand_left = 1;
2323  } else if (cand_up) {
2324  left[-1] = top[0];
2325 
2326  vec0 = (v16u8) __msa_fill_b(left[-1]);
2327 
2328  ST_UB2(vec0, vec0, left, 16);
2329 
2330  cand_up_left = 1;
2331  cand_left = 1;
2332  } else if (cand_up_right) {
2333  vec0 = (v16u8) __msa_fill_b(top[16]);
2334 
2335  ST_UB(vec0, top);
2336 
2337  left[-1] = top[16];
2338 
2339  ST_UB2(vec0, vec0, left, 16);
2340 
2341  cand_up = 1;
2342  cand_up_left = 1;
2343  cand_left = 1;
2344  } else {
2345  left[-1] = 128;
2346  vec0 = (v16u8) __msa_ldi_b(128);
2347 
2348  ST_UB2(vec0, vec0, top, 16);
2349  ST_UB2(vec0, vec0, left, 16);
2350  }
2351  }
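    /* Missing bottom-left references were substituted from the first
     * available neighbour in the order left, top-left, top, top-right,
     * falling back to mid-grey (128) when nothing is available. */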
2352 
2353  if (!cand_left) {
2354  vec0 = (v16u8) __msa_fill_b(left[16]);
2355  ST_UB(vec0, left);
2356  }
2357  if (!cand_up_left) {
2358  left[-1] = left[0];
2359  }
2360  if (!cand_up) {
2361  vec0 = (v16u8) __msa_fill_b(left[-1]);
2362  ST_UB(vec0, top);
2363  }
2364  if (!cand_up_right) {
2365  vec0 = (v16u8) __msa_fill_b(top[15]);
2366  ST_UB(vec0, (top + 16));
2367  }
2368 
2369  top[-1] = left[-1];
2370 
2371 
2372  if (!s->ps.sps->intra_smoothing_disabled_flag
2373  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2374  if (mode != INTRA_DC && 16 != 4) {
2375  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376  int min_dist_vert_hor =
2377  (((((int) (mode - 26U)) >=
2378  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2379  ((((int) (mode - 10U)) >=
2380  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381  ? ((((int) (mode - 10U)) >=
2382  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2383  : ((((int) (mode - 26U)) >=
2384  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2385  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386  filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387  filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388  for (i = 2 * 16 - 2; i >= 0; i--)
2389  filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390  left[i - 1] + 2) >> 2;
2391  filtered_top[-1] =
2392  filtered_left[-1] =
2393  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394  for (i = 2 * 16 - 2; i >= 0; i--)
2395  filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396  top[i - 1] + 2) >> 2;
2397  left = filtered_left;
2398  top = filtered_top;
2399  }
2400  }
2401  }
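    /* Reference smoothing: when the mode is far enough from pure
     * horizontal/vertical (per intra_hor_ver_dist_thresh), both reference
     * arrays are low-pass filtered with (1,2,1)/4 before prediction. */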
2402 
2403  switch (mode) {
2404  case INTRA_PLANAR:
2405  s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2406  (uint8_t *) left, stride);
2407  break;
2408  case INTRA_DC:
2409  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2410  (uint8_t *) left, stride, 4, c_idx);
2411  break;
2412  default:
2413  s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2414  (uint8_t *) left, stride, c_idx, mode);
2415  break;
2416  }
2417 }
2418 
2419 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2420 {
2421  v16u8 vec0, vec1;
2422  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423  v8i16 res0, res1, res2, res3;
2424  v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2426  HEVCLocalContext *lc = s->HEVClc;
2427  int i;
2428  int hshift = s->ps.sps->hshift[c_idx];
2429  int vshift = s->ps.sps->vshift[c_idx];
2430  int size_in_luma_h = 32 << hshift;
2431  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2432  int size_in_luma_v = 32 << vshift;
2433  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2434  int x = x0 >> hshift;
2435  int y = y0 >> vshift;
2436  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2437  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2438 
2439  int cur_tb_addr =
2440  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2441 
2442  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2443  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2444 
2445  int min_pu_width = s->ps.sps->min_pu_width;
2446 
2447  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2448  lc->tu.intra_pred_mode;
2449  uint32_t a;
2450  uint8_t left_array[2 * 32 + 1];
2451  uint8_t filtered_left_array[2 * 32 + 1];
2452  uint8_t top_array[2 * 32 + 1];
2453  uint8_t filtered_top_array[2 * 32 + 1];
2454 
2455  uint8_t *left = left_array + 1;
2456  uint8_t *top = top_array + 1;
2457  uint8_t *filtered_left = filtered_left_array + 1;
2458  uint8_t *filtered_top = filtered_top_array + 1;
2459  int cand_bottom_left = lc->na.cand_bottom_left
2460  && cur_tb_addr >
2461  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2462  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2463  int cand_left = lc->na.cand_left;
2464  int cand_up_left = lc->na.cand_up_left;
2465  int cand_up = lc->na.cand_up;
2466  int cand_up_right = lc->na.cand_up_right
2467  && cur_tb_addr >
2468  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2469  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2470 
2471  int bottom_left_size =
2472  (((y0 + 2 * size_in_luma_v) >
2473  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2474  2 * size_in_luma_v)) -
2475  (y0 + size_in_luma_v)) >> vshift;
2476  int top_right_size =
2477  (((x0 + 2 * size_in_luma_h) >
2478  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2479  (x0 + size_in_luma_h)) >> hshift;
2480 
2481  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2482  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2483  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2484  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2485  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2486  if (!size_in_luma_pu_h)
2487  size_in_luma_pu_h++;
2488  if (cand_bottom_left == 1 && on_pu_edge_x) {
2489  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2490  int y_bottom_pu =
2491  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2492  int max =
2493  ((size_in_luma_pu_v) >
2494  (s->ps.sps->min_pu_height -
2495  y_bottom_pu) ? (s->ps.sps->min_pu_height -
2496  y_bottom_pu) : (size_in_luma_pu_v));
2497  cand_bottom_left = 0;
2498  for (i = 0; i < max; i += 2)
2499  cand_bottom_left |=
2500  ((s->ref->tab_mvf[(x_left_pu) +
2501  (y_bottom_pu +
2502  i) * min_pu_width]).pred_flag ==
2503  PF_INTRA);
2504  }
2505  if (cand_left == 1 && on_pu_edge_x) {
2506  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2507  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2508  int max =
2509  ((size_in_luma_pu_v) >
2510  (s->ps.sps->min_pu_height -
2511  y_left_pu) ? (s->ps.sps->min_pu_height -
2512  y_left_pu) : (size_in_luma_pu_v));
2513  cand_left = 0;
2514  for (i = 0; i < max; i += 2)
2515  cand_left |=
2516  ((s->ref->tab_mvf[(x_left_pu) +
2517  (y_left_pu +
2518  i) * min_pu_width]).pred_flag ==
2519  PF_INTRA);
2520  }
2521  if (cand_up_left == 1) {
2522  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2523  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2524  cand_up_left =
2525  (s->ref->tab_mvf[(x_left_pu) +
2526  (y_top_pu) * min_pu_width]).pred_flag ==
2527  PF_INTRA;
2528  }
2529  if (cand_up == 1 && on_pu_edge_y) {
2530  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2531  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2532  int max =
2533  ((size_in_luma_pu_h) >
2534  (s->ps.sps->min_pu_width -
2535  x_top_pu) ? (s->ps.sps->min_pu_width -
2536  x_top_pu) : (size_in_luma_pu_h));
2537  cand_up = 0;
2538  for (i = 0; i < max; i += 2)
2539  cand_up |=
2540  ((s->ref->tab_mvf[(x_top_pu + i) +
2541  (y_top_pu) *
2542  min_pu_width]).pred_flag == PF_INTRA);
2543  }
2544  if (cand_up_right == 1 && on_pu_edge_y) {
2545  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2546  int x_right_pu =
2547  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2548  int max =
2549  ((size_in_luma_pu_h) >
2550  (s->ps.sps->min_pu_width -
2551  x_right_pu) ? (s->ps.sps->min_pu_width -
2552  x_right_pu) : (size_in_luma_pu_h));
2553  cand_up_right = 0;
2554  for (i = 0; i < max; i += 2)
2555  cand_up_right |=
2556  ((s->ref->tab_mvf[(x_right_pu + i) +
2557  (y_top_pu) *
2558  min_pu_width]).pred_flag == PF_INTRA);
2559  }
2560  vec0 = (v16u8) __msa_ldi_b(128);
2561 
2562  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2563  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2564 
2565  top[-1] = 128;
2566  }
2567  if (cand_up_left) {
2568  left[-1] = src[(-1) + stride * (-1)];
2569  top[-1] = left[-1];
2570  }
2571  if (cand_up) {
2572  LD_UB2(src - stride, 16, vec0, vec1);
2573  ST_UB2(vec0, vec1, top, 16);
2574  }
2575 
2576  if (cand_up_right) {
2577  LD_UB2(src - stride + 32, 16, vec0, vec1);
2578  ST_UB2(vec0, vec1, (top + 32), 16);
2579  do {
2580  uint32_t pix =
2581  ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2582  0x01010101U);
2583  for (i = 0; i < (32 - top_right_size); i += 4)
2584  ((((union unaligned_32 *) (top + 32 + top_right_size +
2585  i))->l) = (pix));
2586  } while (0);
2587  }
2588  if (cand_left)
2589  for (i = 0; i < 32; i++)
2590  left[i] = src[(-1) + stride * (i)];
2591  if (cand_bottom_left) {
2592  for (i = 32; i < 32 + bottom_left_size; i++)
2593  left[i] = src[(-1) + stride * (i)];
2594  do {
2595  uint32_t pix =
2596  ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2597  0x01010101U);
2598  for (i = 0; i < (32 - bottom_left_size); i += 4)
2599  ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2600  i))->l) = (pix));
2601  } while (0);
2602  }
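    /* From here on this function mirrors ff_intra_pred_8_16x16_msa above,
     * with 32-sample reference runs instead of 16. */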
2603 
2604  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2605  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2606  || cand_up_right) {
2607  int size_max_x =
2608  x0 + ((2 * 32) << hshift) <
2609  s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2610  int size_max_y =
2611  y0 + ((2 * 32) << vshift) <
2612  s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2613  int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2614  if (!cand_up_right) {
2615  size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2616  32 : (s->ps.sps->width - x0) >> hshift;
2617  }
2618  if (!cand_bottom_left) {
2619  size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2620  32 : (s->ps.sps->height - y0) >> vshift;
2621  }
2622  if (cand_bottom_left || cand_left || cand_up_left) {
2623  while (j > -1
2624  &&
2625  !((s->ref->tab_mvf[(((x0 +
2626  ((-1) << hshift)) >> s->ps.sps->
2627  log2_min_pu_size)) + (((y0 +
2628  ((j) <<
2629  vshift))
2630  >> s->ps.sps->
2631  log2_min_pu_size))
2632  * min_pu_width]).pred_flag ==
2633  PF_INTRA))
2634  j--;
2635  if (!
2636  ((s->ref->tab_mvf[(((x0 +
2637  ((-1) << hshift)) >> s->ps.sps->
2638  log2_min_pu_size)) + (((y0 + ((j)
2639  <<
2640  vshift))
2641  >> s->ps.sps->
2642  log2_min_pu_size))
2643  * min_pu_width]).pred_flag == PF_INTRA)) {
2644  j = 0;
2645  while (j < size_max_x
2646  &&
2647  !((s->ref->tab_mvf[(((x0 +
2648  ((j) << hshift)) >> s->ps.sps->
2649  log2_min_pu_size)) + (((y0 +
2650  ((-1) <<
2651  vshift))
2652  >> s->
2653  ps.sps->
2654  log2_min_pu_size))
2655  * min_pu_width]).pred_flag ==
2656  PF_INTRA))
2657  j++;
2658  for (i = j; i > (j) - (j + 1); i--)
2659  if (!
2660  ((s->ref->tab_mvf[(((x0 +
2661  ((i -
2662  1) << hshift)) >> s->ps.sps->
2663  log2_min_pu_size)) + (((y0 +
2664  ((-1) <<
2665  vshift))
2666  >> s->
2667  ps.sps->
2668  log2_min_pu_size))
2669  * min_pu_width]).pred_flag ==
2670  PF_INTRA))
2671  top[i - 1] = top[i];
2672  left[-1] = top[-1];
2673  }
2674  } else {
2675  j = 0;
2676  while (j < size_max_x
2677  &&
2678  !((s->ref->tab_mvf[(((x0 +
2679  ((j) << hshift)) >> s->ps.sps->
2680  log2_min_pu_size)) + (((y0 + ((-1)
2681  <<
2682  vshift))
2683  >> s->ps.sps->
2684  log2_min_pu_size))
2685  * min_pu_width]).pred_flag ==
2686  PF_INTRA))
2687  j++;
2688  if (j > 0)
2689  if (x0 > 0) {
2690  for (i = j; i > (j) - (j + 1); i--)
2691  if (!
2692  ((s->ref->tab_mvf[(((x0 +
2693  ((i -
2694  1) << hshift)) >>
2695  s->ps.sps->log2_min_pu_size))
2696  + (((y0 + ((-1)
2697  << vshift))
2698  >>
2699  s->ps.sps->log2_min_pu_size))
2700  *
2701  min_pu_width]).pred_flag ==
2702  PF_INTRA))
2703  top[i - 1] = top[i];
2704  } else {
2705  for (i = j; i > (j) - (j); i--)
2706  if (!
2707  ((s->ref->tab_mvf[(((x0 +
2708  ((i -
2709  1) << hshift)) >>
2710  s->ps.sps->log2_min_pu_size))
2711  + (((y0 + ((-1)
2712  << vshift))
2713  >>
2714  s->ps.sps->log2_min_pu_size))
2715  *
2716  min_pu_width]).pred_flag ==
2717  PF_INTRA))
2718  top[i - 1] = top[i];
2719  top[-1] = top[0];
2720  }
2721  left[-1] = top[-1];
2722  }
2723  left[-1] = top[-1];
2724  if (cand_bottom_left || cand_left) {
2725  a = ((left[-1]) * 0x01010101U);
2726  for (i = 0; i < (0) + (size_max_y); i += 4)
2727  if (!
2728  ((s->ref->tab_mvf[(((x0 +
2729  ((-1) << hshift)) >> s->ps.sps->
2730  log2_min_pu_size)) + (((y0 +
2731  ((i) <<
2732  vshift))
2733  >> s->ps.sps->
2734  log2_min_pu_size))
2735  * min_pu_width]).pred_flag ==
2736  PF_INTRA))
2737  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2738  else
2739  a = ((left[i + 3]) * 0x01010101U);
2740  }
2741  if (!cand_left) {
2742  vec0 = (v16u8) __msa_fill_b(left[-1]);
2743 
2744  ST_UB2(vec0, vec0, left, 16);
2745  }
2746  if (!cand_bottom_left) {
2747  vec0 = (v16u8) __msa_fill_b(left[31]);
2748 
2749  ST_UB2(vec0, vec0, (left + 32), 16);
2750  }
2751  if (x0 != 0 && y0 != 0) {
2752  a = ((left[size_max_y - 1]) * 0x01010101U);
2753  for (i = (size_max_y - 1);
2754  i > (size_max_y - 1) - (size_max_y); i -= 4)
2755  if (!
2756  ((s->ref->tab_mvf[(((x0 +
2757  ((-1) << hshift)) >> s->ps.sps->
2758  log2_min_pu_size)) + (((y0 +
2759  ((i -
2760  3) <<
2761  vshift))
2762  >> s->ps.sps->
2763  log2_min_pu_size))
2764  * min_pu_width]).pred_flag ==
2765  PF_INTRA))
2766  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2767  else
2768  a = ((left[i - 3]) * 0x01010101U);
2769  if (!
2770  ((s->ref->tab_mvf[(((x0 +
2771  ((-1) << hshift)) >> s->ps.sps->
2772  log2_min_pu_size)) + (((y0 + ((-1)
2773  <<
2774  vshift))
2775  >> s->ps.sps->
2776  log2_min_pu_size))
2777  * min_pu_width]).pred_flag == PF_INTRA))
2778  left[-1] = left[0];
2779  } else if (x0 == 0) {
2780  do {
2781  uint32_t pix = ((0) * 0x01010101U);
2782  for (i = 0; i < (size_max_y); i += 4)
2783  ((((union unaligned_32 *) (left + i))->l) = (pix));
2784  } while (0);
2785  } else {
2786  a = ((left[size_max_y - 1]) * 0x01010101U);
2787  for (i = (size_max_y - 1);
2788  i > (size_max_y - 1) - (size_max_y); i -= 4)
2789  if (!
2790  ((s->ref->tab_mvf[(((x0 +
2791  ((-1) << hshift)) >> s->ps.sps->
2792  log2_min_pu_size)) + (((y0 +
2793  ((i -
2794  3) <<
2795  vshift))
2796  >> s->ps.sps->
2797  log2_min_pu_size))
2798  * min_pu_width]).pred_flag ==
2799  PF_INTRA))
2800  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2801  else
2802  a = ((left[i - 3]) * 0x01010101U);
2803  }
2804  top[-1] = left[-1];
2805  if (y0 != 0) {
2806  a = ((left[-1]) * 0x01010101U);
2807  for (i = 0; i < (0) + (size_max_x); i += 4)
2808  if (!
2809  ((s->ref->tab_mvf[(((x0 +
2810  ((i) << hshift)) >> s->ps.sps->
2811  log2_min_pu_size)) + (((y0 + ((-1)
2812  <<
2813  vshift))
2814  >> s->ps.sps->
2815  log2_min_pu_size))
2816  * min_pu_width]).pred_flag ==
2817  PF_INTRA))
2818  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2819  else
2820  a = ((top[i + 3]) * 0x01010101U);
2821  }
2822  }
2823  }
2824 
2825  if (!cand_bottom_left) {
2826  if (cand_left) {
2827  vec0 = (v16u8) __msa_fill_b(left[31]);
2828 
2829  ST_UB2(vec0, vec0, (left + 32), 16);
2830  } else if (cand_up_left) {
2831  vec0 = (v16u8) __msa_fill_b(left[-1]);
2832 
2833  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2834 
2835  cand_left = 1;
2836  } else if (cand_up) {
2837  left[-1] = top[0];
2838 
2839  vec0 = (v16u8) __msa_fill_b(left[-1]);
2840 
2841  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2842 
2843  cand_up_left = 1;
2844  cand_left = 1;
2845  } else if (cand_up_right) {
2846  vec0 = (v16u8) __msa_fill_b(top[32]);
2847 
2848  ST_UB2(vec0, vec0, top, 16);
2849 
2850  left[-1] = top[32];
2851 
2852  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2853 
2854  cand_up = 1;
2855  cand_up_left = 1;
2856  cand_left = 1;
2857  } else {
2858  left[-1] = 128;
2859 
2860  vec0 = (v16u8) __msa_ldi_b(128);
2861 
2862  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2863  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2864  }
2865  }
2866 
2867  if (!cand_left) {
2868  vec0 = (v16u8) __msa_fill_b(left[32]);
2869 
2870  ST_UB2(vec0, vec0, left, 16);
2871  }
2872  if (!cand_up_left) {
2873  left[-1] = left[0];
2874  }
2875  if (!cand_up) {
2876  vec0 = (v16u8) __msa_fill_b(left[-1]);
2877 
2878  ST_UB2(vec0, vec0, top, 16);
2879  }
2880  if (!cand_up_right) {
2881  vec0 = (v16u8) __msa_fill_b(top[31]);
2882 
2883  ST_UB2(vec0, vec0, (top + 32), 16);
2884  }
2885 
2886  top[-1] = left[-1];
2887 
2888 
2889  if (!s->ps.sps->intra_smoothing_disabled_flag
2890  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2891  if (mode != INTRA_DC && 32 != 4) {
2892  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2893  int min_dist_vert_hor =
2894  (((((int) (mode - 26U)) >=
2895  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2896  ((((int) (mode - 10U)) >=
2897  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898  ? ((((int) (mode - 10U)) >=
2899  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2900  : ((((int) (mode - 26U)) >=
2901  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2902  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2903  int threshold = 1 << (8 - 5);
2904  if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2905  && c_idx == 0
2906  && ((top[-1] + top[63] - 2 * top[31]) >=
2907  0 ? (top[-1] + top[63] -
2908  2 * top[31]) : (-(top[-1] + top[63] -
2909  2 * top[31]))) < threshold
2910  && ((left[-1] + left[63] - 2 * left[31]) >=
2911  0 ? (left[-1] + left[63] -
2912  2 * left[31]) : (-(left[-1] + left[63] -
2913  2 * left[31]))) < threshold) {
2914 
2915 
2916  filtered_top[-1] = top[-1];
2917  filtered_top[63] = top[63];
2918 
2919 
2920  for (i = 0; i < 63; i++) {
2921  filtered_top[i] =
2922  ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2923  }
2924 
2925  tmp0 = __msa_fill_h(top[-1]);
2926  tmp1 = __msa_fill_h(top[63]);
2927 
2928  tmp2 = mul_val0 - 8;
2929  tmp3 = mul_val0 - 16;
2930  tmp4 = mul_val0 - 24;
2931  tmp5 = mul_val1 + 8;
2932  tmp6 = mul_val1 + 16;
2933  tmp7 = mul_val1 + 24;
2934 
2935  res0 = mul_val0 * tmp0;
2936  res1 = tmp2 * tmp0;
2937  res2 = tmp3 * tmp0;
2938  res3 = tmp4 * tmp0;
2939  res0 += mul_val1 * tmp1;
2940  res1 += tmp5 * tmp1;
2941  res2 += tmp6 * tmp1;
2942  res3 += tmp7 * tmp1;
2943 
2944  res0 = __msa_srari_h(res0, 6);
2945  res1 = __msa_srari_h(res1, 6);
2946  res2 = __msa_srari_h(res2, 6);
2947  res3 = __msa_srari_h(res3, 6);
2948 
2949  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2950  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2951 
2952  ST_UB2(vec0, vec1, filtered_top, 16);
2953 
2954  res0 = mul_val0 - 32;
2955  tmp2 = mul_val0 - 40;
2956  tmp3 = mul_val0 - 48;
2957  tmp4 = mul_val0 - 56;
2958  res3 = mul_val1 + 32;
2959  tmp5 = mul_val1 + 40;
2960  tmp6 = mul_val1 + 48;
2961  tmp7 = mul_val1 + 56;
2962 
2963  res0 = res0 * tmp0;
2964  res1 = tmp2 * tmp0;
2965  res2 = tmp3 * tmp0;
2966  res0 += res3 * tmp1;
2967  res3 = tmp4 * tmp0;
2968  res1 += tmp5 * tmp1;
2969  res2 += tmp6 * tmp1;
2970  res3 += tmp7 * tmp1;
2971 
2972  res0 = __msa_srari_h(res0, 6);
2973  res1 = __msa_srari_h(res1, 6);
2974  res2 = __msa_srari_h(res2, 6);
2975  res3 = __msa_srari_h(res3, 6);
2976 
2977  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2978  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2979 
2980  ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2981 
2982  filtered_top[63] = top[63];
2983 
2984  tmp0 = __msa_fill_h(left[-1]);
2985  tmp1 = __msa_fill_h(left[63]);
2986 
2987  tmp2 = mul_val0 - 8;
2988  tmp3 = mul_val0 - 16;
2989  tmp4 = mul_val0 - 24;
2990  tmp5 = mul_val1 + 8;
2991  tmp6 = mul_val1 + 16;
2992  tmp7 = mul_val1 + 24;
2993 
2994  res0 = mul_val0 * tmp0;
2995  res1 = tmp2 * tmp0;
2996  res2 = tmp3 * tmp0;
2997  res3 = tmp4 * tmp0;
2998  res0 += mul_val1 * tmp1;
2999  res1 += tmp5 * tmp1;
3000  res2 += tmp6 * tmp1;
3001  res3 += tmp7 * tmp1;
3002 
3003  res0 = __msa_srari_h(res0, 6);
3004  res1 = __msa_srari_h(res1, 6);
3005  res2 = __msa_srari_h(res2, 6);
3006  res3 = __msa_srari_h(res3, 6);
3007 
3008  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3009  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3010 
3011  ST_UB2(vec0, vec1, left, 16);
3012 
3013  res0 = mul_val0 - 32;
3014  tmp2 = mul_val0 - 40;
3015  tmp3 = mul_val0 - 48;
3016  tmp4 = mul_val0 - 56;
3017  res3 = mul_val1 + 32;
3018  tmp5 = mul_val1 + 40;
3019  tmp6 = mul_val1 + 48;
3020  tmp7 = mul_val1 + 56;
3021 
3022  res0 = res0 * tmp0;
3023  res1 = tmp2 * tmp0;
3024  res2 = tmp3 * tmp0;
3025  res0 += res3 * tmp1;
3026  res3 = tmp4 * tmp0;
3027  res1 += tmp5 * tmp1;
3028  res2 += tmp6 * tmp1;
3029  res3 += tmp7 * tmp1;
3030 
3031  res0 = __msa_srari_h(res0, 6);
3032  res1 = __msa_srari_h(res1, 6);
3033  res2 = __msa_srari_h(res2, 6);
3034  res3 = __msa_srari_h(res3, 6);
3035 
3036  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3037  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3038 
3039  ST_UB2(vec0, vec1, (left + 32), 16);
3040 
3041  left[63] = tmp1[0];
3042 
3043  top = filtered_top;
3044  } else {
3045  filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3046  filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3047  for (i = 2 * 32 - 2; i >= 0; i--)
3048  filtered_left[i] = (left[i + 1] + 2 * left[i] +
3049  left[i - 1] + 2) >> 2;
3050  filtered_top[-1] =
3051  filtered_left[-1] =
3052  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3053  for (i = 2 * 32 - 2; i >= 0; i--)
3054  filtered_top[i] = (top[i + 1] + 2 * top[i] +
3055  top[i - 1] + 2) >> 2;
3056  left = filtered_left;
3057  top = filtered_top;
3058  }
3059  }
3060  }
3061  }
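    /* 32x32 smoothing: with sps_strong_intra_smoothing enabled and both
     * reference arrays nearly linear (second difference below
     * 1 << (bitdepth - 5)), the references were replaced by a bilinear
     * ramp between the corner samples; otherwise the ordinary (1,2,1)/4
     * filter was applied. */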
3062 
3063  switch (mode) {
3064  case INTRA_PLANAR:
3065  s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3066  (uint8_t *) left, stride);
3067  break;
3068  case INTRA_DC:
3069  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3070  (uint8_t *) left, stride, 5, c_idx);
3071  break;
3072  default:
3073  s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3074  (uint8_t *) left, stride, c_idx, mode);
3075  break;
3076  }
3077 }
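
The two ff_intra_pred_8_*_msa functions above, together with the pred_dc,
pred_planar and pred_angular entry points, are what the MIPS init code
installs into HEVCPredContext. A hedged sketch of that hookup, with field
names as declared in hevcpred.h (shown for orientation, not a copy of
hevcpred_init_mips.c):

    static void hevc_pred_init_msa_sketch(HEVCPredContext *c)
    {
        /* indices into the function tables are log2(block size) - 2 */
        c->intra_pred[2]   = ff_intra_pred_8_16x16_msa;   /* 16x16 */
        c->intra_pred[3]   = ff_intra_pred_8_32x32_msa;   /* 32x32 */
        c->pred_dc         = ff_hevc_intra_pred_dc_msa;
        c->pred_planar[0]  = ff_hevc_intra_pred_planar_0_msa;
        c->pred_angular[0] = ff_pred_intra_pred_angular_0_msa;
    }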