[motherless] Fixed the broken uploader_id in the extractor (#31243)
* Fixed the broken uploader_id in the extractor.
* Make uploader_id RE looser
* Fix uploader_id in test Motherless_3
* Fix group pagination
* # coding: utf-8

Co-authored-by: Andy Xuming <xuminic@gmail.com>
Co-authored-by: dirkf <fieldhouse@gmx.net>
This commit is contained in:
parent
1b1442887e
commit
82e4eca711
1 changed file with 7 additions and 6 deletions
|
@@ -1,3 +1,4 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
|
@@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor):
|
||||||
'title': 'a/ Hot Teens',
|
'title': 'a/ Hot Teens',
|
||||||
'categories': list,
|
'categories': list,
|
||||||
'upload_date': '20210104',
|
'upload_date': '20210104',
|
||||||
'uploader_id': 'yonbiw',
|
'uploader_id': 'anonymous',
|
||||||
'thumbnail': r're:https?://.*\.jpg',
|
'thumbnail': r're:https?://.*\.jpg',
|
||||||
'age_limit': 18,
|
'age_limit': 18,
|
||||||
},
|
},
|
||||||
|
@@ -127,7 +128,7 @@ class MotherlessIE(InfoExtractor):
|
||||||
|
|
||||||
comment_count = webpage.count('class="media-comment-contents"')
|
comment_count = webpage.count('class="media-comment-contents"')
|
||||||
uploader_id = self._html_search_regex(
|
uploader_id = self._html_search_regex(
|
||||||
r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
|
r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''',
|
||||||
webpage, 'uploader_id')
|
webpage, 'uploader_id')
|
||||||
|
|
||||||
categories = self._html_search_meta('keywords', webpage, default=None)
|
categories = self._html_search_meta('keywords', webpage, default=None)
|
||||||
|
@@ -169,7 +170,7 @@ class MotherlessGroupIE(InfoExtractor):
|
||||||
'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
|
'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
|
||||||
'any kind!'
|
'any kind!'
|
||||||
},
|
},
|
||||||
'playlist_mincount': 9,
|
'playlist_mincount': 0,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@@ -208,9 +209,9 @@ class MotherlessGroupIE(InfoExtractor):
|
||||||
r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
|
r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
|
||||||
description = self._html_search_meta(
|
description = self._html_search_meta(
|
||||||
'description', webpage, fatal=False)
|
'description', webpage, fatal=False)
|
||||||
page_count = self._int(self._search_regex(
|
page_count = str_to_int(self._search_regex(
|
||||||
r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
|
r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
|
||||||
webpage, 'page_count'), 'page_count')
|
webpage, 'page_count', default='1'))
|
||||||
PAGE_SIZE = 80
|
PAGE_SIZE = 80
|
||||||
|
|
||||||
def _get_page(idx):
|
def _get_page(idx):
|
||||||
|
|
Loading…
Reference in a new issue