summaryrefslogtreecommitdiff
path: root/includes/media/IPTC.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/media/IPTC.php')
-rw-r--r--includes/media/IPTC.php576
1 files changed, 576 insertions, 0 deletions
diff --git a/includes/media/IPTC.php b/includes/media/IPTC.php
new file mode 100644
index 00000000..1d19791c
--- /dev/null
+++ b/includes/media/IPTC.php
@@ -0,0 +1,576 @@
+<?php
+/**
+*Class for some IPTC functions.
+
+*/
+class IPTC {
+
+ /**
+ * This takes the results of iptcparse() and puts it into a
+ * form that can be handled by mediawiki. Generally called from
+ * BitmapMetadataHandler::doApp13.
+ *
+ * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
+ *
+ * @param $rawData String app13 block from jpeg containing iptc/iim data
+ * @return Array iptc metadata array
+ */
+ static function parse( $rawData ) {
+ $parsed = iptcparse( $rawData );
+ $data = Array();
+ if (!is_array($parsed)) {
+ return $data;
+ }
+
+ $c = '';
+ //charset info contained in tag 1:90.
+ if (isset($parsed['1#090']) && isset($parsed['1#090'][0])) {
+ $c = self::getCharset($parsed['1#090'][0]);
+ if ($c === false) {
+ //Unknown charset. refuse to parse.
+ //note: There is a different between
+ //unknown and no charset specified.
+ return array();
+ }
+ unset( $parsed['1#090'] );
+ }
+
+ foreach ( $parsed as $tag => $val ) {
+ if ( isset( $val[0] ) && trim($val[0]) == '' ) {
+ wfDebugLog('iptc', "IPTC tag $tag had only whitespace as its value.");
+ continue;
+ }
+ switch( $tag ) {
+ case '2#120': /*IPTC caption. mapped with exif ImageDescription*/
+ $data['ImageDescription'] = self::convIPTC( $val, $c );
+ break;
+ case '2#116': /* copyright. Mapped with exif copyright */
+ $data['Copyright'] = self::convIPTC( $val, $c );
+ break;
+ case '2#080': /* byline. Mapped with exif Artist */
+ /* merge with byline title (2:85)
+ * like how exif does it with
+ * Title, person. Not sure if this is best
+ * approach since we no longer have the two fields
+ * separate. each byline title entry corresponds to a
+ * specific byline. */
+
+ $bylines = self::convIPTC( $val, $c );
+ if ( isset( $parsed['2#085'] ) ) {
+ $titles = self::convIPTC( $parsed['2#085'], $c );
+ } else {
+ $titles = array();
+ }
+
+ for ( $i = 0; $i < count( $titles ); $i++ ) {
+ if ( isset( $bylines[$i] ) ) {
+ // theoretically this should always be set
+ // but doesn't hurt to be careful.
+ $bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
+ }
+ }
+ $data['Artist'] = $bylines;
+ break;
+ case '2#025': /* keywords */
+ $data['Keywords'] = self::convIPTC( $val, $c );
+ break;
+ case '2#101': /* Country (shown)*/
+ $data['CountryDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#095': /* state/province (shown) */
+ $data['ProvinceOrStateDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#090': /* city (Shown) */
+ $data['CityDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#092': /* sublocation (shown) */
+ $data['SublocationDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#005': /* object name/title */
+ $data['ObjectName'] = self::convIPTC( $val, $c );
+ break;
+ case '2#040': /* special instructions */
+ $data['SpecialInstructions'] = self::convIPTC( $val, $c );
+ break;
+ case '2#105': /* headline*/
+ $data['Headline'] = self::convIPTC( $val, $c );
+ break;
+ case '2#110': /* credit */
+ /*"Identifies the provider of the objectdata,
+ * not necessarily the owner/creator". */
+ $data['Credit'] = self::convIPTC( $val, $c );
+ break;
+ case '2#115': /* source */
+ /* "Identifies the original owner of the intellectual content of the
+ *objectdata. This could be an agency, a member of an agency or
+ *an individual." */
+ $data['Source'] = self::convIPTC( $val, $c );
+ break;
+
+ case '2#007': /* edit status (lead, correction, etc) */
+ $data['EditStatus'] = self::convIPTC( $val, $c );
+ break;
+ case '2#015': /* category. deprecated. max 3 letters in theory, often more */
+ $data['iimCategory'] = self::convIPTC( $val, $c );
+ break;
+ case '2#020': /* category. deprecated. */
+ $data['iimSupplementalCategory'] = self::convIPTC( $val, $c );
+ break;
+ case '2#010': /*urgency (1-8. 1 most, 5 normal, 8 low priority)*/
+ $data['Urgency'] = self::convIPTC( $val, $c );
+ break;
+ case '2#022':
+ /* "Identifies objectdata that recurs often and predictably...
+ * Example: Euroweather" */
+ $data['FixtureIdentifier'] = self::convIPTC( $val, $c );
+ break;
+ case '2#026':
+ /* Content location code (iso 3166 + some custom things)
+ * ex: TUR (for turkey), XUN (for UN), XSP (outer space)
+ * See wikipedia article on iso 3166 and appendix D of iim std. */
+ $data['LocationDestCode'] = self::convIPTC( $val, $c );
+ break;
+ case '2#027':
+ /* Content location name. Full printable name
+ * of location of photo. */
+ $data['LocationDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#065':
+ /* Originating Program.
+ * Combine with Program version (2:70) if present.
+ */
+ $software = self::convIPTC( $val, $c );
+
+ if ( count( $software ) !== 1 ) {
+ //according to iim standard this cannot have multiple values
+ //so if there is more than one, something weird is happening,
+ //and we skip it.
+ wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
+ break;
+ }
+
+ if ( isset( $parsed['2#070'] ) ) {
+ //if a version is set for the software.
+ $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
+ unset($parsed['2#070']);
+ $data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
+ } else {
+ $data['Software'] = $software;
+ }
+ break;
+ case '2#075':
+ /* Object cycle.
+ * a for morning (am), p for evening, b for both */
+ $data['ObjectCycle'] = self::convIPTC( $val, $c );
+ break;
+ case '2#100':
+ /* Country/Primary location code.
+ * "Indicates the code of the country/primary location where the
+ * intellectual property of the objectdata was created"
+ * unclear how this differs from 2#026
+ */
+ $data['CountryCodeDest'] = self::convIPTC( $val, $c );
+ break;
+ case '2#103':
+ /* original transmission ref.
+ * "A code representing the location of original transmission ac-
+ * cording to practises of the provider."
+ */
+ $data['OriginalTransmissionRef'] = self::convIPTC( $val, $c );
+ break;
+ case '2#118': /*contact*/
+ $data['Contact'] = self::convIPTC( $val, $c );
+ break;
+ case '2#122':
+ /* Writer/Editor
+ * "Identification of the name of the person involved in the writing,
+ * editing or correcting the objectdata or caption/abstract."
+ */
+ $data['Writer'] = self::convIPTC( $val, $c );
+ break;
+ case '2#135': /* lang code */
+ $data['LanguageCode'] = self::convIPTC( $val, $c );
+ break;
+
+ // Start date stuff.
+ // It doesn't accept incomplete dates even though they are valid
+ // according to spec.
+ // Should potentially store timezone as well.
+ case '2#055':
+ //Date created (not date digitized).
+ //Maps to exif DateTimeOriginal
+ if ( isset( $parsed['2#060'] ) ) {
+ $time = $parsed['2#060'];
+ } else {
+ $time = Array();
+ }
+ $timestamp = self::timeHelper( $val, $time, $c );
+ if ($timestamp) {
+ $data['DateTimeOriginal'] = $timestamp;
+ }
+ break;
+
+ case '2#062':
+ //Date converted to digital representation.
+ //Maps to exif DateTimeDigitized
+ if ( isset( $parsed['2#063'] ) ) {
+ $time = $parsed['2#063'];
+ } else {
+ $time = Array();
+ }
+ $timestamp = self::timeHelper( $val, $time, $c );
+ if ($timestamp) {
+ $data['DateTimeDigitized'] = $timestamp;
+ }
+ break;
+
+ case '2#030':
+ //Date released.
+ if ( isset( $parsed['2#035'] ) ) {
+ $time = $parsed['2#035'];
+ } else {
+ $time = Array();
+ }
+ $timestamp = self::timeHelper( $val, $time, $c );
+ if ($timestamp) {
+ $data['DateTimeReleased'] = $timestamp;
+ }
+ break;
+
+ case '2#037':
+ //Date expires.
+ if ( isset( $parsed['2#038'] ) ) {
+ $time = $parsed['2#038'];
+ } else {
+ $time = Array();
+ }
+ $timestamp = self::timeHelper( $val, $time, $c );
+ if ($timestamp) {
+ $data['DateTimeExpires'] = $timestamp;
+ }
+ break;
+
+ case '2#000': /* iim version */
+ // unlike other tags, this is a 2-byte binary number.
+ //technically this is required if there is iptc data
+ //but in practise it isn't always there.
+ if ( strlen( $val[0] ) == 2 ) {
+ //if is just to be paranoid.
+ $versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
+ $versionValue += ord( substr( $val[0], 1, 1 ) );
+ $data['iimVersion'] = $versionValue;
+ }
+ break;
+
+ case '2#004':
+ // IntellectualGenere.
+ // first 4 characters are an id code
+ // That we're not really interested in.
+
+ // This prop is weird, since it's
+ // allowed to have multiple values
+ // in iim 4.1, but not in the XMP
+ // stuff. We're going to just
+ // extract the first value.
+ $con = self::ConvIPTC( $val, $c );
+ if ( strlen( $con[0] ) < 5 ) {
+ wfDebugLog( 'iptc', 'IPTC: '
+ . '2:04 too short. '
+ . 'Ignoring.' );
+ break;
+ }
+ $extracted = substr( $con[0], 4 );
+ $data['IntellectualGenre'] = $extracted;
+ break;
+
+ case '2#012':
+ // Subject News code - this is a compound field
+ // at the moment we only extract the subject news
+ // code, which is an 8 digit (ascii) number
+ // describing the subject matter of the content.
+ $codes = self::convIPTC( $val, $c );
+ foreach ( $codes as $ic ) {
+ $fields = explode(':', $ic, 3 );
+
+ if ( count( $fields ) < 2 ||
+ $fields[0] !== 'IPTC' )
+ {
+ wfDebugLog( 'IPTC', 'IPTC: '
+ . 'Invalid 2:12 - ' . $ic );
+ break;
+ }
+ $data['SubjectNewsCode'] = $fields[1];
+ }
+ break;
+
+ // purposely does not do 2:125, 2:130, 2:131,
+ // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
+ // 2:200, 2:201, 2:202
+ // or the audio stuff (2:150 to 2:154)
+
+ case '2#070':
+ case '2#060':
+ case '2#063':
+ case '2#085':
+ case '2#038':
+ case '2#035':
+ //ignore. Handled elsewhere.
+ break;
+
+ default:
+ wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ));
+ break;
+ }
+
+ }
+ return $data;
+ }
+
+ /**
+ * Convert an iptc date and time tags into the exif format
+ *
+ * @todo Potentially this should also capture the timezone offset.
+ * @param Array $date The date tag
+ * @param Array $time The time tag
+ * @param $c
+ * @return String Date in exif format.
+ */
+ private static function timeHelper( $date, $time, $c ) {
+ if ( count( $date ) === 1 ) {
+ //the standard says this should always be 1
+ //just double checking.
+ list($date) = self::convIPTC( $date, $c );
+ } else {
+ return null;
+ }
+
+ if ( count( $time ) === 1 ) {
+ list($time) = self::convIPTC( $time, $c );
+ $dateOnly = false;
+ } else {
+ $time = '000000+0000'; //placeholder
+ $dateOnly = true;
+ }
+
+ if ( ! ( preg_match('/\d\d\d\d\d\d[-+]\d\d\d\d/', $time)
+ && preg_match('/\d\d\d\d\d\d\d\d/', $date)
+ && substr($date, 0, 4) !== '0000'
+ && substr($date, 4, 2) !== '00'
+ && substr($date, 6, 2) !== '00'
+ ) ) {
+ //something wrong.
+ // Note, this rejects some valid dates according to iptc spec
+ // for example: the date 00000400 means the photo was taken in
+ // April, but the year and day is unknown. We don't process these
+ // types of incomplete dates atm.
+ wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )");
+ return null;
+ }
+
+ $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ));
+ if ( $unixTS === false ) {
+ wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
+ return null;
+ }
+
+ $tz = ( intval( substr( $time, 7, 2 ) ) *60*60 )
+ + ( intval( substr( $time, 9, 2 ) ) * 60 );
+
+ if ( substr( $time, 6, 1 ) === '-' ) {
+ $tz = - $tz;
+ }
+
+ $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
+ if ( $finalTimestamp === false ) {
+ wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
+ return null;
+ }
+ if ( $dateOnly ) {
+ //return the date only
+ return substr( $finalTimestamp, 0, 10 );
+ } else {
+ return $finalTimestamp;
+ }
+ }
+
+ /**
+ * Helper function to convert charset for iptc values.
+ * @param $data Mixed String or Array: The iptc string
+ * @param $charset String: The charset
+ *
+ * @return string
+ */
+ private static function convIPTC ( $data, $charset ) {
+ if ( is_array( $data ) ) {
+ foreach ($data as &$val) {
+ $val = self::convIPTCHelper( $val, $charset );
+ }
+ } else {
+ $data = self::convIPTCHelper( $data, $charset );
+ }
+
+ return $data;
+ }
+ /**
+ * Helper function of a helper function to convert charset for iptc values.
+ * @param $data Mixed String or Array: The iptc string
+ * @param $charset String: The charset
+ *
+ * @return string
+ */
+ private static function convIPTCHelper ( $data, $charset ) {
+ if ( $charset ) {
+ wfSuppressWarnings();
+ $data = iconv($charset, "UTF-8//IGNORE", $data);
+ wfRestoreWarnings();
+ if ($data === false) {
+ $data = "";
+ wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8");
+ }
+ } else {
+ //treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
+ // most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
+ $oldData = $data;
+ UtfNormal::quickIsNFCVerify( $data ); //make $data valid utf-8
+ if ($data === $oldData) {
+ return $data; //if validation didn't change $data
+ } else {
+ return self::convIPTCHelper( $oldData, 'Windows-1252' );
+ }
+ }
+ return trim( $data );
+ }
+
+ /**
+ * take the value of 1:90 tag and returns a charset
+ * @param String $tag 1:90 tag.
+ * @return string charset name or "?"
+ * Warning, this function does not (and is not intended to) detect
+ * all iso 2022 escape codes. In practise, the code for utf-8 is the
+ * only code that seems to have wide use. It does detect that code.
+ */
+ static function getCharset($tag) {
+
+ //According to iim standard, charset is defined by the tag 1:90.
+ //in which there are iso 2022 escape sequences to specify the character set.
+ //the iim standard seems to encourage that all necessary escape sequences are
+ //in the 1:90 tag, but says it doesn't have to be.
+
+ //This is in need of more testing probably. This is definitely not complete.
+ //however reading the docs of some other iptc software, it appears that most iptc software
+ //only recognizes utf-8. If 1:90 tag is not present content is
+ // usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.
+
+ //This also won't work if there are more than one escape sequence in the 1:90 tag
+ //or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
+
+ // This is just going through the charsets mentioned in appendix C of the iim standard.
+
+ // \x1b = ESC.
+ switch ( $tag ) {
+ case "\x1b%G": //utf-8
+ //Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
+ case "\x1b(B": // ascii
+ case "\x1b(@": // iso-646-IRV (ascii in latest version, $ different in older version)
+ $c = 'UTF-8';
+ break;
+ case "\x1b(A": //like ascii, but british.
+ $c = 'ISO646-GB';
+ break;
+ case "\x1b(C": //some obscure sweedish/finland encoding
+ $c = 'ISO-IR-8-1';
+ break;
+ case "\x1b(D":
+ $c = 'ISO-IR-8-2';
+ break;
+ case "\x1b(E": //some obscure danish/norway encoding
+ $c = 'ISO-IR-9-1';
+ break;
+ case "\x1b(F":
+ $c = 'ISO-IR-9-2';
+ break;
+ case "\x1b(G":
+ $c = 'SEN_850200_B'; // aka iso 646-SE; ascii-like
+ break;
+ case "\x1b(I":
+ $c = "ISO646-IT";
+ break;
+ case "\x1b(L":
+ $c = "ISO646-PT";
+ break;
+ case "\x1b(Z":
+ $c = "ISO646-ES";
+ break;
+ case "\x1b([":
+ $c = "GREEK7-OLD";
+ break;
+ case "\x1b(K":
+ $c = "ISO646-DE";
+ break;
+ case "\x1b(N": //crylic
+ $c = "ISO_5427";
+ break;
+ case "\x1b(`": //iso646-NO
+ $c = "NS_4551-1";
+ break;
+ case "\x1b(f": //iso646-FR
+ $c = "NF_Z_62-010";
+ break;
+ case "\x1b(g":
+ $c = "PT2"; //iso646-PT2
+ break;
+ case "\x1b(h":
+ $c = "ES2";
+ break;
+ case "\x1b(i": //iso646-HU
+ $c = "MSZ_7795.3";
+ break;
+ case "\x1b(w":
+ $c = "CSA_Z243.4-1985-1";
+ break;
+ case "\x1b(x":
+ $c = "CSA_Z243.4-1985-2";
+ break;
+ case "\x1b\$(B":
+ case "\x1b\$B":
+ case "\x1b&@\x1b\$B":
+ case "\x1b&@\x1b\$(B":
+ $c = "JIS_C6226-1983";
+ break;
+ case "\x1b-A": // iso-8859-1. at least for the high code characters.
+ case "\x1b(@\x1b-A":
+ case "\x1b(B\x1b-A":
+ $c = 'ISO-8859-1';
+ break;
+ case "\x1b-B": // iso-8859-2. at least for the high code characters.
+ $c = 'ISO-8859-2';
+ break;
+ case "\x1b-C": // iso-8859-3. at least for the high code characters.
+ $c = 'ISO-8859-3';
+ break;
+ case "\x1b-D": // iso-8859-4. at least for the high code characters.
+ $c = 'ISO-8859-4';
+ break;
+ case "\x1b-E": // iso-8859-5. at least for the high code characters.
+ $c = 'ISO-8859-5';
+ break;
+ case "\x1b-F": // iso-8859-6. at least for the high code characters.
+ $c = 'ISO-8859-6';
+ break;
+ case "\x1b-G": // iso-8859-7. at least for the high code characters.
+ $c = 'ISO-8859-7';
+ break;
+ case "\x1b-H": // iso-8859-8. at least for the high code characters.
+ $c = 'ISO-8859-8';
+ break;
+ case "\x1b-I": // CSN_369103. at least for the high code characters.
+ $c = 'CSN_369103';
+ break;
+ default:
+ wfDebugLog('iptc', __METHOD__ . 'Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
+ //at this point just give up and refuse to parse iptc?
+ $c = false;
+ }
+ return $c;
+ }
+}