--- /dev/null
+/**
+ * Copyright (C) 2010-2014 Freescale Semiconductor, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+.section .text
+
+.global hdmi_dma_copy_16_neon_lut
+.global hdmi_dma_copy_16_neon_fast
+.global hdmi_dma_copy_24_neon_lut
+.global hdmi_dma_copy_24_neon_fast
+
+
+/**
+ * hdmi_dma_copy_16_neon_lut
+ * Convert pcm sample to iec sample. Pcm sample is 16 bits.
+ * Frame index's between 0 and 47 inclusively. Channel count can be 1, 2, 4, 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_16_neon_lut(unsigned short *src, unsigned int *dst,
+ * int samples, unsigned char *lookup_table);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM16 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ * lookup_table Preconstructed header table. Channels interleaved.
+ */
+
+hdmi_dma_copy_16_neon_lut:
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d6, r12
+
+hdmi_dma_copy_16_neon_lut_start:
+
+ /* get 8 samples to q0 */
+ vld1.16 {d0, d1}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q1, q0 /* count of 1s */
+ vpadd.i8 d2, d2, d3 /* only care about the LST in every element */
+ vand d2, d2, d6 /* clear other bits while keep the least bit */
+ vshl.u8 d2, d2, #3 /* bit p: d2 = d2 << 3 */
+
+ /* get packet header */
+ vld1.8 {d5}, [r3]!
+ veor d4, d5, d2 /* xor bit c */
+
+ /* store: (d4 << 16 | q0) << 8 */
+ vmovl.u8 q2, d4 /* expand from char to short */
+ vzip.16 q0, q2
+ vshl.u32 q0, q0, #8
+ vshl.u32 q1, q2, #8
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_16_neon_lut_start
+
+ mov pc, lr
+
+/**
+ * hdmi_dma_copy_16_neon_fast
+ * Convert pcm sample to iec sample. Pcm sample is 16 bits.
+ * Frame index's between 48 and 191 inclusively.
+ * Channel count can be 1, 2, 4 or 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_16_neon_fast(unsigned short *src,
+ * unsigned int *dst, int samples);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM16 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ */
+
+hdmi_dma_copy_16_neon_fast:
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d6, r12
+
+hdmi_dma_copy_16_neon_fast_start:
+ /* get 8 samples to q0 */
+ vld1.16 {d0, d1}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q1, q0 /* count of 1s */
+ vpadd.i8 d2, d2, d3
+ vand d2, d2, d6 /* clear other bits while keep the LST */
+ /* finally we construct packet header */
+ vshl.u8 d4, d2, #3 /* bit p: d2 = d2 << 3 */
+
+ /* get packet header: always 0 */
+
+ /* store: (d4 << 16 | q0) << 8 */
+ vmovl.u8 q2, d4 /* expand from char to short */
+ vzip.16 q0, q2
+ vshl.u32 q0, q0, #8
+ vshl.u32 q1, q2, #8
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_16_neon_fast_start
+
+ mov pc, lr
+
+
+
+/**
+ * hdmi_dma_copy_24_neon_lut
+ * Convert pcm sample to iec sample. Pcm sample is 24 bits.
+ * Frame index's between 0 and 47 inclusively. Channel count can be 1, 2, 4, 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_24_neon_lut(unsigned int *src, unsigned int *dst,
+ * int samples, unsigned char *lookup_table);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM24 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ * lookup_table Preconstructed header table. Channels interleaved.
+ */
+
+hdmi_dma_copy_24_neon_lut:
+ vpush {d8}
+
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d8, r12
+
+hdmi_dma_copy_24_neon_lut_start:
+
+ /* get 8 samples to q0 and q1 */
+ vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q2, q0 /* count of 1s */
+ vpadd.i8 d4, d4, d5 /* only care about the LSB in every element */
+ vcnt.8 q3, q1
+ vpadd.i8 d6, d6, d7
+ vpadd.i8 d4, d4, d6 /* d4: contains xor result and other dirty bits */
+ vand d4, d4, d8 /* clear other bits while keep the least bit */
+ vshl.u8 d4, d4, #3 /* bit p: d4 = d4 << 3 */
+
+ /* get packet header */
+ vld1.8 {d5}, [r3]!/* d5: original header */
+ veor d5, d5, d4 /* fix bit p */
+
+ /* store: (d5 << 24 | q0) */
+ vmovl.u8 q3, d5 /* expand from char to short */
+ vmovl.u16 q2, d6 /* expand from short to int */
+ vmovl.u16 q3, d7
+ vshl.u32 q2, q2, #24
+ vshl.u32 q3, q3, #24
+ vorr q0, q0, q2
+ vorr q1, q1, q3
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_24_neon_lut_start
+
+ vpop {d8}
+ mov pc, lr
+
+/**
+ * hdmi_dma_copy_24_neon_fast
+ * Convert pcm sample to iec sample. Pcm sample is 24 bits.
+ * Frame index's between 48 and 191 inclusively.
+ * Channel count can be 1, 2, 4 or 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_24_neon_fast(unsigned int *src,
+ * unsigned int *dst, int samples);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM24 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ */
+
+hdmi_dma_copy_24_neon_fast:
+ vpush {d8}
+
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d8, r12
+
+hdmi_dma_copy_24_neon_fast_start:
+ /* get 8 samples to q0 and q1 */
+ vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q2, q0 /* count of 1s */
+ vpadd.i8 d4, d4, d5 /* only care about the LSB in every element */
+ vcnt.8 q3, q1
+ vpadd.i8 d6, d6, d7
+ vpadd.i8 d4, d4, d6 /* d4: contains xor result and other dirty bits */
+ vand d4, d4, d8 /* clear other bits while keep the least bit */
+ vshl.u8 d4, d4, #3 /* bit p: d4 = d4 << 3 */
+
+ /* store: (d4 << 24 | q0) */
+ vmovl.u8 q3, d4 /* expand from char to short */
+ vmovl.u16 q2, d6 /* expand from short to int */
+ vmovl.u16 q3, d7
+ vshl.u32 q2, q2, #24
+ vshl.u32 q3, q3, #24
+ vorr q0, q0, q2
+ vorr q1, q1, q3
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_24_neon_fast_start
+
+ vpop {d8}
+ mov pc, lr
/* max 8 channels supported; channels are interleaved */
static u8 g_packet_head_table[48 * 8];
+void hdmi_dma_copy_16_neon_lut(unsigned short *src, unsigned int *dst,
+ int samples, unsigned char *lookup_table);
+void hdmi_dma_copy_16_neon_fast(unsigned short *src, unsigned int *dst,
+ int samples);
+void hdmi_dma_copy_24_neon_lut(unsigned int *src, unsigned int *dst,
+ int samples, unsigned char *lookup_table);
+void hdmi_dma_copy_24_neon_fast(unsigned int *src, unsigned int *dst,
+ int samples);
+static void hdmi_dma_irq_enable(struct hdmi_dma_priv *priv);
+static void hdmi_dma_irq_disable(struct hdmi_dma_priv *priv);
+
union hdmi_audio_header_t iec_header;
EXPORT_SYMBOL(iec_header);
}
}
+#ifdef HDMI_DMA_NO_NEON
/* Optimization for IEC head */
static void hdmi_dma_copy_16_c_lut(u16 *src, u32 *dst, int samples,
u8 *lookup_table)
dst += samples;
}
}
+#else
+/* NEON optimization for IEC head*/
+
+/**
+ * Convert pcm samples to iec samples suitable for HDMI transfer.
+ * PCM sample is 16 bits length.
+ * Frame index always starts from 0.
+ * Channel count can be 1, 2, 4, 6, or 8
+ * Sample count (frame_count * channel_count) is multipliable by 8.
+ */
+static void hdmi_dma_copy_16(u16 *src, u32 *dst, int framecount, int channelcount)
+{
+ /* split input frames into 192-frame each */
+ int i, count_in_192 = (framecount + 191) / 192;
+
+ for (i = 0; i < count_in_192; i++) {
+ int count, samples;
+
+ /* handles frame index [0, 48) */
+ count = (framecount < 48) ? framecount : 48;
+ samples = count * channelcount;
+ hdmi_dma_copy_16_neon_lut(src, dst, samples, g_packet_head_table);
+ framecount -= count;
+ if (framecount == 0)
+ break;
+
+ src += samples;
+ dst += samples;
+
+ /* handles frame index [48, 192) */
+ count = (framecount < 192 - 48) ? framecount : (192 - 48);
+ samples = count * channelcount;
+ hdmi_dma_copy_16_neon_fast(src, dst, samples);
+ framecount -= count;
+ src += samples;
+ dst += samples;
+ }
+}
+#endif
static void hdmi_dma_mmap_copy(struct snd_pcm_substream *substream,
int offset, int count)