# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
import ast, markdown, re, json

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# HTML snippet (a horizontal rule padded by blank paragraphs) used to
# separate the different comments of an issue.
separator = (
    '\n<p>&nbsp;</p>\n'
    '<hr>\n'
    '<p>&nbsp;</p>\n'
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def concat(bodies):
    """Join the comment bodies of one issue into a single string.

    Args:
        bodies: text holding a Python literal (parsed with
            ``ast.literal_eval``) that evaluates to an iterable of strings.

    Returns:
        The parsed items joined with the module-level ``separator``.

    Raises:
        Whatever ``ast.literal_eval`` raises on input it cannot parse; the
        offending value is printed first to ease debugging.
    """
    try:
        array = ast.literal_eval(bodies)
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are not intercepted here.
        print("bodies: ", bodies)
        raise
    return separator.join(array)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Reshape a list so that a mustache template can iterate over it.
def list_for_mustache(list):
    """Return a JSON array of ``{"data": item}`` objects.

    ``list`` is the text of a Python literal (parsed with
    ``ast.literal_eval``); each element obtained by iterating over the parsed
    value is wrapped in a one-key dict and the result is serialised with
    ``json.dumps``. The parameter name shadows the builtin and is kept only
    for interface compatibility.
    """
    parsed_items = ast.literal_eval(list)
    wrapped = []
    for item in parsed_items:
        wrapped.append({"data": item})
    return json.dumps(wrapped)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Replace issue references such as #1234 with the URL
# https://github.com/scikit-learn/scikit-learn/issues/1234, then convert the
# text from Markdown to HTML. Only references made of at least three digits
# match the pattern, so a short reference like #42 is left untouched.
re_issue_id = re.compile(r'#([0-9]{3,})')
def to_html(s):
    """Render one issue text as HTML.

    Args:
        s: the Markdown source. Byte strings are decoded as UTF-8, text
            strings are used as-is, and any other value is converted to its
            text representation.

    Returns:
        The result of ``markdown.markdown`` applied to the text after issue
        references have been replaced by their GitHub URL.
    """
    # Normalise to text once, up front. The previous str(s) + .decode('utf8')
    # sequence fails on Python 3 (str has no decode) and on Python 2 for
    # non-ASCII unicode input (implicit ASCII encoding in str()).
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
    if isinstance(s, bytes):
        text = s.decode('utf8')
    elif isinstance(s, text_type):
        text = s
    else:
        text = text_type(s)

    # NOTE(review): the reference becomes a bare URL rather than Markdown link
    # syntax; confirm the consumer turns it into a clickable link.
    text = re_issue_id.sub(
        'https://github.com/scikit-learn/scikit-learn/issues/\\1', text)
    return markdown.markdown(text)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read the "issues_cache" input dataset into a dataframe.
issues_cache = dataiku.Dataset("issues_cache")
df = issues_cache.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Apply the helpers defined above to build the output columns.
labels_column = df['labels']
df['tags_mustache'] = labels_column.map(list_for_mustache)

# Comment bodies are no longer included because we hit the API limit.
# To restore them, compute df['comments_bodies'].map(concat) and append
# separator + that_series.map(to_html) to the 'texts' column below.
rendered_bodies = df['body'].map(to_html)
df['texts'] = rendered_bodies

# The raw Markdown body is dropped once its HTML rendering is stored.
df.drop('body', axis=1, inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write the dataframe, together with its schema, to the output dataset.
output_dataset = dataiku.Dataset("issues_with_comments_bodies")
output_dataset.write_with_schema(df)