[PATCH dvb-apps] Fix scan handling for EN 300468 charsets

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



It always bothered me that scan tables produced in Brazil
were full of "reverse interrogation" character, due to
accents, like:
	Rïdio
instead of:
	RÃdio

This is due to the lack of properly interpreting the charset
and calling iconv to handle it.

If, for any reason, the iconv can't convert the charset, it falls
back to the old behavior.

Yet, according with the EN 300 468 Annex A, if the first character
is >= 0x20 (space), the parser should assume character set
ISO-6937-2 (in fact, the standard defines a variant, with an 
extra euro character).

However, at least here, the DVB-C operator doesn't honour the
charset table selection.

Due to that, a new option were added to scan, allowing to override
the charset default. So, in the case of Brazil, using a:
	-C ISO-8859-1
parameter, will properly produce the right channel descriptions, like:

Mosaico InformaÃÃo:651000000:INVERSION_AUTO:5217000:FEC_3_4:QAM_256:3664:3665:401

Signed-off-by: Mauro Carvalho Chehab <mchehab@xxxxxxxxxx>

diff -r f4e015ebdac7 util/scan/scan.c
--- a/util/scan/scan.c	Thu Mar 17 14:46:30 2011 +0100
+++ b/util/scan/scan.c	Wed Mar 30 21:58:29 2011 -0300
@@ -33,6 +33,7 @@
 #include <assert.h>
 #include <glob.h>
 #include <ctype.h>
+#include <iconv.h>
 
 #include <linux/dvb/frontend.h>
 #include <linux/dvb/dmx.h>
@@ -66,6 +67,7 @@
 static int vdr_version = 3;
 static struct lnb_types_st lnb_type;
 static int unique_anon_services;
+char *default_charset = "ISO-6937-2";
 
 static enum fe_spectral_inversion spectral_inversion = INVERSION_AUTO;
 
@@ -543,60 +545,113 @@
 	}
 }
 
+/*
+ * handle character set correctly (e.g. via iconv)
+ *   c.f. EN 300 468 annex A 
+ */
+static void descriptorcpy(char **dest, const unsigned char *src, size_t len)
+{
+	size_t destlen, i;
+	char *p, *type = NULL;
+
+	if (*dest) {
+		free (*dest);
+		*dest = NULL;
+	}
+	if (!len)
+		return;
+
+	if (*src < 0x20) {
+		switch (*src) {
+		case 0x00:	type = "ISO-6937-2";		break;
+		case 0x01:	type = "ISO-8859-5";		break;
+		case 0x02:	type = "ISO-8859-6";		break;
+		case 0x03:	type = "ISO-8859-7";		break;
+		case 0x04:	type = "ISO-8859-8";		break;
+		case 0x05:	type = "ISO-8859-9";		break;
+		case 0x06:	type = "ISO-8859-10";		break;
+		case 0x07:	type = "ISO-8859-11";		break;
+		case 0x09:	type = "ISO-8859-13";		break;
+		case 0x0a:	type = "ISO-8859-14";		break;
+		case 0x0b:	type = "ISO-8859-15";		break;
+		case 0x11:	type = "ISO-10646";		break;
+		case 0x12:	type = "ISO-2022-KR";		break;
+		case 0x13:	type = "GB2312";		break;
+		case 0x14:	type = "BIG5";			break;
+		case 0x15:	type = "ISO-10646/UTF-8";	break;
+		case 0x10: /* ISO8859 */
+			if ((*(src + 1) != 0) || *(src + 2) > 0x0f)
+				break;
+			src+=2;
+			len-=2;
+			switch(*src) {
+			case 0x01:	type = "ISO-8859-1";		break;
+			case 0x02:	type = "ISO-8859-2";		break;
+			case 0x03:	type = "ISO-8859-3";		break;
+			case 0x04:	type = "ISO-8859-4";		break;
+			case 0x05:	type = "ISO-8859-5";		break;
+			case 0x06:	type = "ISO-8859-6";		break;
+			case 0x07:	type = "ISO-8859-7";		break;
+			case 0x08:	type = "ISO-8859-8";		break;
+			case 0x09:	type = "ISO-8859-9";		break;
+			case 0x0a:	type = "ISO-8859-10";		break;
+			case 0x0b:	type = "ISO-8859-11";		break;
+			case 0x0d:	type = "ISO-8859-13";		break;
+			case 0x0e:	type = "ISO-8859-14";		break;
+			case 0x0f:	type = "ISO-8859-15";		break;
+			}
+		}
+		src++;
+		len--;
+	} else {
+		type = default_charset;
+	}
+
+	/* Destin length should be bigger, to allow 2 char sequences */
+	destlen = len * 2;
+	*dest = malloc(destlen + 1);
+	p = *dest;
+
+	if (type) {
+		iconv_t cd = iconv_open("UTF-8//IGNORE", type);
+		if (cd == (iconv_t)(-1)) {
+			type = NULL;
+		} else {
+			iconv(cd, (char **)&src, &len, &p, &destlen);
+			iconv_close(cd);
+			*p = '\0';
+
+			return;
+		}
+	}
+
+	/* Fallback method: just output whatever non-control char */
+	p = *dest;
+	if (!type) {
+		for (i = 0; i < len; i++, src++) {
+			if (*src >= 0x20 && (*src < 0x80 || *src > 0x9f))
+				*p++ = *src;
+
+		}
+	}
+	*p = '\0';
+}
+
 static void parse_service_descriptor (const unsigned char *buf, struct service *s)
 {
 	unsigned char len;
-	unsigned char *src, *dest;
 
 	s->type = buf[2];
 
 	buf += 3;
 	len = *buf;
 	buf++;
-
-	if (s->provider_name)
-		free (s->provider_name);
-
-	s->provider_name = malloc (len + 1);
-	memcpy (s->provider_name, buf, len);
-	s->provider_name[len] = '\0';
-
-	/* remove control characters (FIXME: handle short/long name) */
-	/* FIXME: handle character set correctly (e.g. via iconv)
-	 * c.f. EN 300 468 annex A */
-	for (src = dest = (unsigned char *) s->provider_name; *src; src++)
-		if (*src >= 0x20 && (*src < 0x80 || *src > 0x9f))
-			*dest++ = *src;
-	*dest = '\0';
-	if (!s->provider_name[0]) {
-		/* zap zero length names */
-		free (s->provider_name);
-		s->provider_name = 0;
-	}
-
-	if (s->service_name)
-		free (s->service_name);
+	descriptorcpy(&s->provider_name, buf, len);
 
 	buf += len;
 	len = *buf;
 	buf++;
-
-	s->service_name = malloc (len + 1);
-	memcpy (s->service_name, buf, len);
-	s->service_name[len] = '\0';
-
-	/* remove control characters (FIXME: handle short/long name) */
-	/* FIXME: handle character set correctly (e.g. via iconv)
-	 * c.f. EN 300 468 annex A */
-	for (src = dest = (unsigned char *) s->service_name; *src; src++)
-		if (*src >= 0x20 && (*src < 0x80 || *src > 0x9f))
-			*dest++ = *src;
-	*dest = '\0';
-	if (!s->service_name[0]) {
-		/* zap zero length names */
-		free (s->service_name);
-		s->service_name = 0;
-	}
+	descriptorcpy(&s->service_name, buf, len);
 
 	info("0x%04x 0x%04x: pmt_pid 0x%04x %s -- %s (%s%s)\n",
 	    s->transport_stream_id,
@@ -2118,7 +2173,8 @@
 	"	-P do not use ATSC PSIP tables for scanning\n"
 	"	    (but only PAT and PMT) (applies for ATSC only)\n"
 	"	-A N	check for ATSC 1=Terrestrial [default], 2=Cable or 3=both\n"
-	"	-U	Uniquely name unknown services\n";
+	"	-U	Uniquely name unknown services\n"
+	"	-C cs	Override default charset (default = ISO-6937-2)\n";
 
 void
 bad_usage(char *pname, int problem)
@@ -2166,7 +2222,7 @@
 
 	/* start with default lnb type */
 	lnb_type = *lnb_enum(0);
-	while ((opt = getopt(argc, argv, "5cnpa:f:d:s:o:x:e:t:i:l:vquPA:U")) != -1) {
+	while ((opt = getopt(argc, argv, "5cnpa:f:d:s:o:x:e:t:i:l:vquPA:UC:")) != -1) {
 		switch (opt) {
 		case 'a':
 			adapter = strtoul(optarg, NULL, 0);
@@ -2246,6 +2302,9 @@
 		case 'U':
 			unique_anon_services = 1;
 			break;
+		case 'C':
+			default_charset = optarg;
+			break;
 		default:
 			bad_usage(argv[0], 0);
 			return -1;
--
To unsubscribe from this list: send the line "unsubscribe linux-media" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Input]     [Video for Linux]     [Gstreamer Embedded]     [Mplayer Users]     [Linux USB Devel]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [Yosemite Backpacking]
  Powered by Linux