"""
articles module (imdb package).
This module provides functions and data to handle in a smart way
articles (in various languages) at the beginning of movie titles.
Copyright 2009 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# List of generic articles in many languages.
# XXX: Managing titles in a lot of different languages, a function to recognize
# an initial article can't be perfect; sometimes we'll stumble upon a short
# word that is an article in some language, but it's not in another; in these
# situations we have to choose if we want to interpret this little word
# as an article or not (remember that we don't know what the original language
# of the title was).
# Example: 'da' is an article in (I think) Dutch and it's used as an article
# even in some American slangs. Unfortunately it's also a preposition in
# Italian, and it's widely used in Mandarin (for whatever it means!).
# Running a script over the whole list of titles (and aliases), I've found
# that 'da' is used as an article only 23 times, and as another thing 298
# times, so I've decided to _always_ consider 'da' as a non article.
#
# Here is a list of words that are _never_ considered as articles, complete
# with the cound of times they are used in a way or another:
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
# 'da' (23 vs 298), "'n" (8 vs 12)
#
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
# I'm not sure what '-al' is, and so I've left it out...
#
# Generic list of articles in utf-8 encoding:
GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
'\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')
# Lists of articles separated by language. If possible, the list should
# be sorted by frequency (not very important, but...)
# If you want to add a list of articles for another language, mail it
# it at imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8
# encoded.
LANG_ARTICLES = {
'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
'unas'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages doesn't have articles.
}
LANG_ARTICLESget = LANG_ARTICLES.get
# Maps a language to countries where it is the main language.
# If you want to add an entry for another language or country, mail it at
# imdbpy-devel@lists.sourceforge.net .
_LANG_COUNTRIES = {
'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
'Italian': ('Italy',),
'Spanish': ('Spain', 'Mexico'),
'Portuguese': ('Portugal', 'Brazil'),
'Turkish': ('Turkey',),
#'German': ('Germany', 'East Germany', 'West Germany'),
#'French': ('France'),
}
# Maps countries to their main language.
COUNTRY_LANG = {}
for lang in _LANG_COUNTRIES:
for country in _LANG_COUNTRIES[lang]:
COUNTRY_LANG[country] = lang
def toUnicode(articles):
"""Convert a list of articles utf-8 encoded to unicode strings."""
return tuple([art.decode('utf_8') for art in articles])
def toDicts(articles):
"""Given a list of utf-8 encoded articles, build two dictionary (one
utf-8 encoded and another one with unicode keys) for faster matches."""
uArticles = toUnicode(articles)
return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles])
def addTrailingSpace(articles):
"""From the given list of utf-8 encoded articles, return two
lists (one utf-8 encoded and another one in unicode) where a space
is added at the end - if the last char is not ' or -."""
_spArticles = []
_spUnicodeArticles = []
for article in articles:
if article[-1] not in ("'", '-'):
article += ' '
_spArticles.append(article)
_spUnicodeArticles.append(article.decode('utf_8'))
return _spArticles, _spUnicodeArticles
# Caches.
_ART_CACHE = {}
_SP_ART_CACHE = {}
def articlesDictsForLang(lang):
"""Return dictionaries of articles specific for the given language, or the
default one if the language is not known."""
if lang in _ART_CACHE:
return _ART_CACHE[lang]
artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
_ART_CACHE[lang] = artDicts
return artDicts
def spArticlesForLang(lang):
"""Return lists of articles (plus optional spaces) specific for the
given language, or the default one if the language is not known."""
if lang in _SP_ART_CACHE:
return _SP_ART_CACHE[lang]
spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
_SP_ART_CACHE[lang] = spArticles
return spArticles
|