Gemfury

pclos / python3-imdbpy deb

Repository URL to install this package:
Details
python3-imdbpy / usr / lib / python3 / dist-packages / imdb / parser / http / personParser.py
# Copyright 2004-2020 Davide Alberani <da@erlug.linux.it>
#           2008-2018 H. Turgut Uyar <uyar@tekir.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""
This module provides the classes (and the instances) that are used to parse
the IMDb pages on the www.imdb.com server about a person.

For example, for "Mel Gibson" the referred pages would be:

categorized
    http://www.imdb.com/name/nm0000154/maindetails

biography
    http://www.imdb.com/name/nm0000154/bio

...and so on.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import re

from imdb.utils import analyze_name

from .movieParser import (
    DOMHTMLAwardsParser,
    DOMHTMLNewsParser,
    DOMHTMLOfficialsitesParser,
    DOMHTMLTechParser
)
from .piculet import Path, Rule, Rules, transformers
from .utils import DOMParserBase, analyze_imdbid, build_movie, build_person


# Matches one or more consecutive whitespace characters (newlines included).
_re_spaces = re.compile(r'\s+')
# Three groups: the "<li>... .... " lead-in, the (non-greedy) text after it,
# and the terminating "</li>" or "<br>" tag.  Case-insensitive, and "." also
# matches newlines.
_reRoles = re.compile(
    r'(<li>.*? \.\.\.\. )(.*?)(</li>|<br>)',
    re.IGNORECASE | re.MULTILINE | re.DOTALL
)


class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # Matches a parenthesised roman numeral at the start of a string,
    # e.g. "(I)" or "(XIV)".
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    # Sub-rules evaluated inside the "Born:" block.
    _birth_rules = [
        Rule(
            key='birth date',
            extractor=Path('.//time[@itemprop="birthDate"]/@datetime')
        ),
        Rule(
            key='birth place',
            extractor=Path('.//a[starts-with(@href, "/search/name?birth_place=")]/text()')
        )
    ]

    # Sub-rules evaluated inside the "Died:" block.
    _death_rules = [
        Rule(
            key='death date',
            extractor=Path('.//time[@itemprop="deathDate"]/@datetime')
        ),
        Rule(
            key='death place',
            extractor=Path('.//a[starts-with(@href, "/search/name?death_place=")]/text()')
        )
    ]

    # Sub-rules evaluated for every "filmo-row" element of the filmography.
    _film_rules = [
        Rule(
            key='link',
            extractor=Path('./b/a[1]/@href')
        ),
        Rule(
            key='title',
            extractor=Path('./b/a[1]/text()')
        ),
        Rule(
            key='notes',
            extractor=Path('./b/following-sibling::text()')
        ),
        Rule(
            key='year',
            extractor=Path('./span[@class="year_column"]/text()')
        ),
        Rule(
            key='status',
            extractor=Path('./a[@class="in_production"]/text()')
        ),
        Rule(
            key='rolesNoChar',
            extractor=Path('.//br/following-sibling::text()')
        ),
        Rule(
            key='chrRoles',
            extractor=Path('./a[@imdbpyname]/@imdbpyname')
        )
    ]

    rules = [
        Rule(
            key='name',
            extractor=Path(
                '//h1[@class="header"]//text()',
                transform=lambda x: analyze_name(x)
            )
        ),
        Rule(
            key='name_index',
            extractor=Path('//h1[@class="header"]/span[1]/text()')
        ),
        Rule(
            key='birth info',
            extractor=Rules(
                section='//div[h4="Born:"]',
                rules=_birth_rules
            )
        ),
        Rule(
            key='death info',
            extractor=Rules(
                section='//div[h4="Died:"]',
                rules=_death_rules,
            )
        ),
        Rule(
            key='headshot',
            extractor=Path('//td[@id="img_primary"]//div[@class="image"]/a/img/@src')
        ),
        Rule(
            key='akas',
            extractor=Path(
                '//div[h4="Alternate Names:"]/text()',
                transform=lambda x: x.strip().split('  ')
            )
        ),
        Rule(
            key='filmography',
            extractor=Rules(
                foreach='//div[starts-with(@id, "filmo-head-")]',
                rules=[
                    Rule(
                        # The key is the section heading, lower-cased.
                        key=Path(
                            './a[@name]/text()',
                            transform=lambda x: x.lower().replace(': ', ' ')
                        ),
                        extractor=Rules(
                            foreach='./following-sibling::div[1]/div[starts-with(@class, "filmo-row")]',
                            rules=_film_rules,
                            transform=lambda x: build_movie(
                                x.get('title') or '',
                                year=x.get('year'),
                                movieID=analyze_imdbid(x.get('link') or ''),
                                rolesNoChar=(x.get('rolesNoChar') or '').strip(),
                                chrRoles=(x.get('chrRoles') or '').strip(),
                                additionalNotes=x.get('notes'),
                                status=x.get('status') or None
                            )
                        )
                    )
                ]
            )
        ),
        Rule(
            key='in development',
            extractor=Rules(
                foreach='//div[starts-with(@class,"devitem")]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href')
                    ),
                    Rule(
                        key='title',
                        extractor=Path('./a/text()')
                    )
                ],
                # NOTE(review): only 'link' and 'title' are extracted above,
                # so 'roleID' and 'status' always take their fallback values
                # here -- confirm whether that is intended.
                transform=lambda x: build_movie(
                    x.get('title') or '',
                    movieID=analyze_imdbid(x.get('link') or ''),
                    roleID=(x.get('roleID') or '').split('/'),
                    status=x.get('status') or None
                )
            )
        ),
        Rule(
            key='imdbID',
            extractor=Path('//meta[@property="pageId"]/@content',
                           transform=lambda x: (x or '').replace('nm', ''))
        )
    ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''), ('<br/>', '<br />')
    ]

    def postprocess_data(self, data):
        """Tidy up the dictionary produced by ``rules``.

        The following adjustments are made in place:

        * the per-section ``'filmography'`` dictionaries are merged into a
          single dictionary;
        * a dictionary stored under ``'name'`` is merged into ``data``;
        * empty top-level ``'birth date'`` / ``'death date'`` values are
          dropped;
        * the helper key ``'name_index'`` is always removed and, when its
          text starts with a parenthesised roman numeral, ``'imdbIndex'`` is
          set to that text minus its first and last characters;
        * ``'birth place'`` / ``'death place'`` are renamed to
          ``'birth notes'`` / ``'death notes'``.

        :param data: the dictionary built by the extraction rules.
        :returns: the same dictionary, modified in place.
        """
        # Merge the list of per-section dictionaries into one; anything that
        # is not a non-empty dictionary is ignored.
        filmo = {}
        for job in (data.get('filmography') or []):
            if not isinstance(job, dict) or not job:
                continue
            filmo.update(job)
        if filmo:
            data['filmography'] = filmo

        # The 'name' rule yields whatever analyze_name() returns; when that
        # is a dictionary its items are promoted to the top level.
        name_info = data.get('name')
        if isinstance(name_info, dict):
            del data['name']
            data.update(name_info)

        # NOTE(review): 'birth info' / 'death info' are not flattened by this
        # method, so these top-level keys exist only if something else put
        # them there -- confirm.
        for what in ('birth date', 'death date'):
            if what in data and not data[what]:
                del data[what]

        # 'name_index' is only a helper for computing 'imdbIndex'.  Pop it
        # unconditionally so that a blank value cannot leak into the result.
        name_index = (data.pop('name_index', None) or '').strip()
        if self._name_imdb_index.match(name_index):
            data['imdbIndex'] = name_index[1:-1]

        # XXX: the code below is for backwards compatibility
        # probably could be removed
        for old_key, new_key in (('birth place', 'birth notes'),
                                 ('death place', 'death notes')):
            if old_key in data:
                data[new_key] = data.pop(old_key)
        return data


class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Sub-rules evaluated inside the table cell that follows "Born".
    _birth_rules = [
        Rule(
            key='birth date',
            extractor=Path(
                './time/@datetime',
                # NOTE(review): assumes the attribute always consists of
                # exactly three '-'-separated integers -- confirm.
                transform=lambda s: '%4d-%02d-%02d' % tuple(map(int, s.split('-')))
            )
        ),
        Rule(
            key='birth notes',
            extractor=Path('./a[starts-with(@href, "/search/name?birth_place=")]/text()')
        )
    ]

    # Sub-rules evaluated inside the table cell that follows "Died".
    _death_rules = [
        Rule(
            key='death date',
            extractor=Path(
                './time/@datetime',
                # NOTE(review): same three-part date assumption as for
                # 'birth date' -- confirm.
                transform=lambda s: '%4d-%02d-%02d' % tuple(map(int, s.split('-')))
            )
        ),
        Rule(
            key='death cause',
            extractor=Path(
                './text()',
                # Drops the first two characters of the stripped text.
                transform=lambda x: ''.join(x).strip()[2:].lstrip()
            )
        ),
        Rule(
            key='death notes',
            extractor=Path(
                '..//text()',
                # Keeps only the last line, with whitespace runs collapsed.
                transform=lambda x: _re_spaces.sub(' ', (x or '').strip().split('\n')[-1])
            )
        )
    ]

    rules = [
        Rule(
            key='headshot',
            extractor=Path('//img[@class="poster"]/@src')
        ),
        Rule(
            key='birth info',
            extractor=Rules(
                section='//table[@id="overviewTable"]'
                        '//td[text()="Born"]/following-sibling::td[1]',
                rules=_birth_rules
            )
        ),
        Rule(
            key='death info',
            extractor=Rules(
                section='//table[@id="overviewTable"]'
                        '//td[text()="Died"]/following-sibling::td[1]',
                rules=_death_rules
            )
        ),
        Rule(
            key='nick names',
            extractor=Path(
                '//table[@id="overviewTable"]'
                '//td[starts-with(text(), "Nickname")]/following-sibling::td[1]/text()',
                reduce=lambda xs: '|'.join(xs),
                transform=lambda x: [
                    n.strip().replace(' (', '::(', 1)
                    for n in x.split('|') if n.strip()
                ]
            )
        ),
        Rule(
            key='birth name',
            extractor=Path(
                '//table[@id="overviewTable"]'
                '//td[text()="Birth Name"]/following-sibling::td[1]/text()',
                transform=lambda x: x.strip()
            )
        ),
        Rule(
            key='height',
            extractor=Path(
                '//table[@id="overviewTable"]'
                '//td[text()="Height"]/following-sibling::td[1]/text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='mini biography',
            extractor=Rules(
                foreach='//h4[starts-with(text(), "Mini Bio")]/following-sibling::div',
                rules=[
                    Rule(
                        key='bio',
                        extractor=Path('.//text()')
                    ),
                    Rule(
                        key='by',
                        extractor=Path('.//a[@name="ba"]//text()')
                    )
                ],
                # Result format: "<biography text>::<author>".
                transform=lambda x: "%s::%s" % (
                    (x.get('bio') or '').split('- IMDb Mini Biography By:')[0].strip(),
                    (x.get('by') or '').strip() or 'Anonymous'
                )
            )
        ),
        Rule(
            key='spouse',
            extractor=Rules(
                foreach='//a[@name="spouse"]/following::table[1]//tr',
                rules=[
                    Rule(
                        key='name',
                        extractor=Path('./td[1]//text()')
                    ),
                    Rule(
                        key='info',
                        extractor=Path('./td[2]//text()')
                    )
                ],
                # Result format: "<name>::<info>", with leading/trailing
                # colons removed.  'name' is guarded like 'info' so that a
                # row without text in its first cell does not raise.
                transform=lambda x: ("%s::%s" % (
                    (x.get('name') or '').strip(),
                    (_re_spaces.sub(' ', x.get('info') or '')).strip())).strip(':')
            )
        ),
        Rule(
            key='trade mark',
            extractor=Path(
                foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trade Mark")]'
                        '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='trivia',
            extractor=Path(
                foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Trivia")]'
                        '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='quotes',
            extractor=Path(
                foreach='//div[@class="_imdbpyh4"]/h4[starts-with(text(), "Personal Quotes")]'
                        '/.././div[contains(@class, "soda")]',
                path='.//text()',
                transform=transformers.strip
            )
        ),
        Rule(
            key='salary history',
            extractor=Rules(
                foreach='//a[@name="salary"]/following::table[1]//tr',
                rules=[
                    Rule(
                        key='title',
                        extractor=Path('./td[1]//text()')
                    ),
                    Rule(
                        key='info',
                        extractor=Path('./td[2]//text()')
                    )
                ],
                # Result format: "<title>::<info>".  'title' is guarded like
                # 'info' so that a row without text in its first cell does
                # not raise.
                transform=lambda x: "%s::%s" % (
                    (x.get('title') or '').strip(),
                    _re_spaces.sub(' ', (x.get('info') or '')).strip())
            )
        )
    ]

    # The "_imdbpyh4" wrapper divs introduced here are what the
    # 'trade mark', 'trivia' and 'quotes' rules above select on.
    preprocessors = [
        (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile('(<h4)', re.I), r'</div><div class="_imdbpyh4">\1'),
        (re.compile('(</table>\n</div>\\s+)</div>', re.I + re.DOTALL), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')
    ]

    def postprocess_data(self, data):
        """Flatten the birth/death sub-dictionaries and drop empty values.

        A dictionary stored under ``'birth info'`` or ``'death info'`` is
        removed and its items are merged into ``data``; afterwards empty
        ``'birth date'``, ``'death date'`` and ``'death cause'`` values are
        deleted.

        :param data: the dictionary built by the extraction rules.
        :returns: the same dictionary, modified in place.
        """
        for key in ['birth info', 'death info']:
            if key in data and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        for what in 'birth date', 'death date', 'death cause':
            if what in data and not data[what]:
                del data[what]
        return data


class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parse the "other works" page of a person.

    The page must be supplied as a string, as served by www.imdb.com.
    The single rule collects the text of every
    ``<li class="ipl-zebra-list__item">`` element, passed through
    ``transformers.strip``, under the ``'other works'`` key.

    Example::

        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True

    rules = [
        Rule(key='other works',
             extractor=Path(path='.//text()',
                            foreach='//li[@class="ipl-zebra-list__item"]',
                            transform=transformers.strip))
    ]


class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    The ``kind`` attribute is the key under which ``postprocess_data``
    stores the parsed data.

    Example::

        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    rules = [
        Rule(
            key='genres',
            extractor=Rules(
                foreach='//b/a[@name]/following-sibling::a[1]',
                rules=[
                    Rule(
                        # The key is the lower-cased text of the link.
                        key=Path('./text()', transform=str.lower),
                        extractor=Rules(
                            foreach='../../following-sibling::ol[1]/li//a[1]',
                            rules=[
                                Rule(
                                    key='link',
                                    extractor=Path('./@href')
                                ),
                                Rule(
                                    key='title',
                                    extractor=Path('./text()')
                                ),
                                Rule(
                                    key='info',
                                    extractor=Path('./following-sibling::text()')
                                )
                            ],
                            # Any of the three fields may be missing from the
                            # extracted dictionary; fall back to '' so that
                            # one incomplete entry does not raise.  Only the
                            # part of 'info' before the first '[' is kept.
                            transform=lambda x: build_movie(
                                (x.get('title') or '') + (x.get('info') or '').split('[')[0],
                                analyze_imdbid(x.get('link') or ''))
                        )
                    )
                ]
            )
        )
    ]

    def postprocess_data(self, data):
        """Wrap the parsed data under the ``kind`` key.

        :param data: the dictionary built by the extraction rules.
        :returns: an empty dictionary when nothing was parsed, otherwise
            ``{self.kind: data}``.
        """
        if not data:
            return {}
        return {self.kind: data}

def _process_person_award(x):
    """Build a cleaned-up award dictionary from one extracted row.

    Text fields are stripped of surrounding whitespace, and any field that
    is missing or empty after stripping is left out of the result, so a
    whitespace-only ``'year'`` is skipped instead of being passed to
    ``int()``.

    :param x: mapping that may contain the text fields ``'year'``,
        ``'result'``, ``'prize'``, ``'category'`` and ``'award'``, plus the
        ``'movies'`` and ``'shared with'`` values, which are copied as they
        are when truthy.
    :returns: a new dictionary holding only the non-empty fields; ``'year'``
        is converted to ``int``.
    :raises ValueError: if ``'year'`` contains non-blank text that is not an
        integer.
    """
    def _text(key):
        # A missing key and an explicit None are both treated as empty.
        return (x.get(key) or '').strip()

    awards = {}

    year = _text('year')
    if year:
        awards['year'] = int(year)

    for key in ('result', 'prize', 'category'):
        value = _text(key)
        if value:
            awards[key] = value

    movies = x.get('movies')
    if movies:
        awards['movies'] = movies

    award = _text('award')
    if award:
        awards['award'] = award

    shared_with = x.get('shared with')
    if shared_with:
        awards['shared with'] = shared_with
    return awards


class DOMHTMLPersonAwardsParser(DOMParserBase):
    """Parser for the awards page of a given person.

    The fields extracted from each ``<tr>`` of the ``awards`` tables are
    passed to ``_process_person_award``; the outcome is stored under the
    ``'awards'`` key.
    """
    _defGetRefs = True

    rules = [
        Rule(
            key='awards',
            extractor=Rules(
                foreach='//table[@class="awards"]/tr',
                transform=_process_person_award,
                rules=[
                    Rule(key='year',
                         extractor=Path('./td[@class="award_year"]/a/text()')),
                    Rule(key='result',
                         extractor=Path('./td[@class="award_outcome"]/b/text()')),
                    Rule(key='prize',
                         extractor=Path('.//span[@class="award_category"]/text()')),
                    # One movie object per link in the description cell.
                    Rule(
                        key='movies',
                        extractor=Rules(
                            rules=[
                                Rule(key='title', extractor=Path('./text()')),
                                Rule(key='link', extractor=Path('./@href')),
                                Rule(key='year',
                                     extractor=Path('./following-sibling::span'
                                                    '[@class="title_year"][1]/text()')),
                            ],
                            transform=lambda movie: build_movie(
                                movie.get('title') or '',
                                movieID=analyze_imdbid(movie.get('link')),
                                year=movie.get('year')
                            )
                        ),
                        foreach='./td[@class="award_description"]/a'
                    ),
                    # One person object per link in the list that follows
                    # the "shared_with" div.
                    Rule(
                        key='shared with',
                        extractor=Rules(
                            rules=[
                                Rule(key='name', extractor=Path('./text()')),
                                Rule(key='link', extractor=Path('./@href')),
                            ],
                            transform=lambda person: build_person(
                                person.get('name') or '',
                                personID=analyze_imdbid(person.get('link'))
                            )
                        ),
                        foreach='./td[@class="award_description"]'
                                '/div[@class="shared_with"]/following-sibling::ul//a'
                    ),
                    Rule(key='category',
                         extractor=Path('./td[@class="award_description"]/text()')),
                    Rule(key='award',
                         extractor=Path('../preceding-sibling::h3[1]/text()')),
                ]
            )
        )
    ]


# Table of the named parsers provided by this module.  Every value is a
# pair: a one-element tuple holding the parser class, and either None or a
# dictionary of extra settings (here only 'kind').
# NOTE(review): presumably read by the package's parser-loading code, which
# instantiates each class and applies the extra settings -- confirm against
# the consumer of _OBJECTS.
_OBJECTS = {
    'maindetails_parser': ((DOMHTMLMaindetailsParser,), None),
    'bio_parser': ((DOMHTMLBioParser,), None),
    'otherworks_parser': ((DOMHTMLOtherWorksParser,), None),
    'person_officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'person_awards_parser': ((DOMHTMLPersonAwardsParser,), None),
    'publicity_parser': ((DOMHTMLTechParser,), {'kind': 'publicity'}),
    'person_contacts_parser': ((DOMHTMLTechParser,), {'kind': 'contacts'}),
    'person_genres_parser': ((DOMHTMLPersonGenresParser,), None),
    'person_keywords_parser': ((DOMHTMLPersonGenresParser,), {'kind': 'keywords'}),
    'news_parser': ((DOMHTMLNewsParser,), None),
}
pclos / python3-imdbpy deb

Products

About

Resources

Contact Gemfury