# writing_with_film_subtitle_parser

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import random
import re
from main.settings import db
import srt
import datetime
from pattern.en import parse

## Input files for this run.
## videofile: path of the original video; it is stored as 'filename' on every
##            sentence record built below.
## srtfile:   path of the subtitle file that is read and handed to parse_vtt().
##            Despite the variable name it points at a .vtt file, not an .srt.
videofile   = 'input/weekly-address.mp4'
srtfile     = 'input/subtitle.vtt'

def convert_time(timestring):
    """Convert an ``hh:mm:ss.mmm`` timestamp string into seconds.

    Every run of digits in *timestring* is read, in order, as hours,
    minutes, seconds and milliseconds; the separators between them are
    ignored.

    Args:
        timestring: Timestamp text such as ``'00:01:02.500'``.

    Returns:
        The timestamp as a float number of seconds (``62.5`` for the
        example above).

    Raises:
        IndexError: If *timestring* contains fewer than four digit groups.
    """
    # Build a real list: on Python 3 ``map`` returns a lazy iterator that
    # cannot be indexed, so the lookups below would raise TypeError.
    nums = [float(digits) for digits in re.findall(r'\d+', timestring)]
    return 3600*nums[0] + 60*nums[1] + nums[2] + nums[3]/1000

def parse_vtt(lines):
    """Group the lines of a subtitle (.vtt) file into timed cues.

    A line containing at least one ``h:m:s.ms`` timecode opens a cue. The
    following lines are joined (line breaks replaced by spaces) into the
    cue text until a line holding only a line break closes the cue. Lines
    seen while no cue is open (file header, extra blank lines, ...) are
    ignored.

    Args:
        lines: Iterable of text lines, each still carrying its line ending
            (for example the result of ``file.readlines()``).

    Returns:
        A list of ``(times, text)`` tuples in input order. ``times`` is the
        list of every timecode found on the cue's timecode line, converted
        to seconds with ``convert_time``; ``text`` is the joined cue text
        with one trailing whitespace character removed.
    """
    # Raw string so that ``\.`` reaches the regex engine untouched; compiled
    # once because it is applied to every line.
    timecode_re = re.compile(r"[0-9]*:[0-9]*:[0-9]*\.[0-9]*")

    timed_texts = []
    current_times, current_text = None, ""
    for line in lines:
        times = timecode_re.findall(line)
        ## Found a timecode
        if times:
            # A concrete list (not a lazy ``map`` object) so callers can
            # index the start and end times on Python 3 as well.
            current_times = [convert_time(stamp) for stamp in times]

        ## Only a line-break. Add text and timecode to timed_texts
        elif re.match(r"^\r?\n$", line) and current_times is not None:
            timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))
            current_times, current_text = None, ""

        ## Extend texts
        elif current_times is not None:
            current_text = current_text + re.sub(r'\r?\n', ' ', line)

    ## Input ended while a cue was still open (no trailing blank line):
    ## keep that last cue instead of silently dropping it.
    if current_times is not None:
        timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))

    return timed_texts

## Read the whole subtitle file first so the handle is closed before the
## parsing and database work starts.
with open(srtfile) as subtitle_file:
    subtitle_lines = subtitle_file.readlines()

## Build one record per subtitle cue, with an estimated timing for every word,
## and store it in db.sentences.
for times, text in parse_vtt(subtitle_lines):
    if not text:
        ## A cue without text has no words to time, and its zero length
        ## would make the per-character duration below divide by zero.
        continue

    start, end = times[0], times[1]
    sentence = {
        'filename': videofile,
        'text': text,
        'start': start,
        'end': end,
        'duration': end - start,
        'words': [],
    }

    ## Word timings are estimated by spreading the cue duration evenly over
    ## the characters of its text.
    char_duration = sentence['duration'] / len(text)

    ## NOTE(review): pattern documents this keyword as lowercase `tokenize`;
    ## the original spelling is kept unchanged -- confirm it has any effect.
    parsed_sentence = parse(text, Tokenize=True, chunks=False)

    ## Position in `text` just after the last word that was located.
    cursor = 0

    ## split() yields one token list per detected sentence. Walk all of them:
    ## a single cue may hold several sentences, and looking only at the
    ## first one loses every word after the first sentence boundary.
    for tagged_sentence in parsed_sentence.split():
        for token in tagged_sentence:
            word = token[0]
            tag = token[1]

            index = text.find(word, cursor)
            if index == -1:
                ## The token does not appear verbatim in the text (the parser
                ## may have altered it): fall back to the current position
                ## rather than letting -1 corrupt this and later lookups.
                index = cursor
            else:
                ## Continue after this word so that a repeated word is
                ## matched at its next occurrence, not at the same spot.
                cursor = index + len(word)

            word_start = index * char_duration
            word_duration = len(word) * char_duration
            word_end = word_start + word_duration

            sentence['words'].append({
                'word': word,
                'start': sentence['start'] + word_start,
                'end': sentence['start'] + word_end,
                'duration': word_duration,
                'tag': tag,
            })

    db.sentences.insert(sentence)