From 5decf7a17079d3c653f4a60372f73fa41cb89d84 Mon Sep 17 00:00:00 2001 From: Ilya Shakhat Date: Wed, 8 Oct 2014 16:12:27 +0400 Subject: [PATCH] Fix Unicode support in members retrieval method Change-Id: Ib9ad2350e49498fe42c2ab28e990af8bec238ebe --- stackalytics/processor/mps.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/stackalytics/processor/mps.py b/stackalytics/processor/mps.py index 83d9e9ff7..9daca56ba 100644 --- a/stackalytics/processor/mps.py +++ b/stackalytics/processor/mps.py @@ -12,8 +12,8 @@ # implied. # See the License for the specific language governing permissions and # limitations under the License. -import random +import random import re import time @@ -29,21 +29,16 @@ NAME_AND_DATE_PATTERN = r'

(?P[^<]*)[\s\S]*?' \ r'
(?P[^<]*)' COMPANY_PATTERN = r'Date\sJoined[\s\S]*?(?P[^<]*)' \ r'[\s\S]*?From\s(?P[\s\S]*?)\(Current\)' +GARBAGE_PATTERN = r'[/\\~%^\*_]+' -def _convert_str_fields_to_unicode(result): - for field, value in six.iteritems(result): - if type(value) is str: - try: - value = six.text_type(value, 'utf8') - result[field] = value - except Exception: - pass +def strip_garbage(s): + return re.sub(r'\s+', ' ', re.sub(GARBAGE_PATTERN, '', s)) def _retrieve_member(uri, member_id, html_parser): - content = utils.read_uri(uri) + content = six.text_type(utils.read_uri(uri), 'utf8') if not content: return {} @@ -54,7 +49,7 @@ def _retrieve_member(uri, member_id, html_parser): result = rec.groupdict() member['member_id'] = member_id - member['member_name'] = result['member_name'] + member['member_name'] = strip_garbage(result['member_name']) member['date_joined'] = result['date_joined'] member['member_uri'] = uri break @@ -63,7 +58,8 @@ def _retrieve_member(uri, member_id, html_parser): for rec in re.finditer(COMPANY_PATTERN, content): result = rec.groupdict() - member['company_draft'] = html_parser.unescape(result['company_draft']) + member['company_draft'] = strip_garbage( + html_parser.unescape(result['company_draft'])) return member @@ -99,8 +95,6 @@ def log(uri, runtime_storage_inst, days_to_update_members, members_look_ahead): cur_index += 1 continue - _convert_str_fields_to_unicode(member) - cnt_empty = 0 last_member_index = cur_index cur_index += 1