Skip to content

Commit

Permalink
index into algolia
Browse files Browse the repository at this point in the history
  • Loading branch information
paulvidal committed Aug 18, 2019
1 parent ea72658 commit 4f13cb7
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 1 deletion.
2 changes: 2 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
algoliasearch==2.0.4
ansicolors==1.1.8
asn1crypto==0.24.0
certifi==2019.3.9
Expand All @@ -16,6 +17,7 @@ itsdangerous==1.1.0
Jinja2==2.10.1
jwcrypto==0.6.0
MarkupSafe==1.1.1
mistune==0.8.4
pycparser==2.19
PyGithub==1.43.7
Pygments==2.4.2
Expand Down
Empty file added backend/search/__init__.py
Empty file.
18 changes: 18 additions & 0 deletions backend/search/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os

from algoliasearch.search_client import SearchClient

# Algolia settings are read from the environment; os.getenv yields None for unset variables.
ALGOLIA_APP_ID = os.getenv("ALGOLIA_APP_ID")
ALGOLIA_ADMIN_KEY = os.getenv("ALGOLIA_ADMIN_KEY")
# NOTE(review): ALGOLIA_SEARCH_KEY is read here but not used in the visible part of this module.
ALGOLIA_SEARCH_KEY = os.getenv("ALGOLIA_SEARCH_KEY")

# Name of the Algolia index that documents are written to.
# NOTE(review): the "test_" prefix is hard-coded — confirm this is the intended index everywhere.
INDEX_NAME = "test_DOCUMENTATION"

# The client and the index handle are created once, at import time, with the admin key.
CLIENT = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_ADMIN_KEY)
INDEX = CLIENT.init_index(INDEX_NAME)


def insert_doc(doc):
    """Save a single record to the module-level Algolia index.

    :param doc: the record to store; it is handed to ``INDEX.save_objects``
        wrapped in a one-element list.
    """
    records = [doc]
    INDEX.save_objects(records)
133 changes: 133 additions & 0 deletions backend/search/markdown_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import hashlib

import mistune
import re

from search import client


class CustomRenderer(mistune.Renderer):
    """Mistune renderer that indexes a markdown document while rendering it.

    Every header and every (non image/link-only) paragraph encountered during
    rendering is turned into a search record and sent to
    ``client.insert_doc``. A record for the document title itself is sent as
    soon as the renderer is constructed. The HTML output is left untouched:
    each hook delegates to the base ``mistune.Renderer`` implementation.
    """

    # Matches rendered paragraph text that starts with an ``<img`` or ``<a``
    # tag and ends with ``>``; such paragraphs are not indexed.
    _IMAGE_OR_LINK_ONLY = re.compile(r'^<(img|a).*>$')

    def __init__(self, title, source):
        """Create the renderer and immediately index the document title.

        :param title: document title, stored on every record and used as the
            base of the record link.
        :param source: value stored in the ``source`` field of every record.
        """
        super().__init__()

        self.title = title
        self.source = source

        # Stack of the headers enclosing the current position, outermost first.
        self.current_header = []
        self.tables = []
        self.lists = []

        # Insert the document title
        self._insert_content([], None, type='title')

    def header(self, text, level, raw=None):
        """Record the header in the header stack, index it, then render it."""
        self._insert_header(text, level)
        self._insert_content(self.current_header, None, type='header')

        return super().header(text, level, raw)

    def paragraph(self, text):
        """Index the paragraph under the current headers, then render it."""
        # Insert only if not an image or a link alone
        if not self._IMAGE_OR_LINK_ONLY.search(text):
            self._insert_content(self.current_header, text, type='paragraph')

        return super().paragraph(text)

    def _insert_content(self, headers, content, type):
        """Build one search record and send it to ``client.insert_doc``.

        :param headers: list of ``{'h': text, 'level': n}`` dicts describing
            the headers enclosing the content, outermost first.
        :param content: text of the record, or a falsy value for records
            without content (title and header records).
        :param type: value stored in the record's ``type`` field.
        """
        # The link anchor is built from the outermost header only.
        anchor = "#" + str(headers[0]['h']) if headers else ''

        doc = {
            'source': self.source,
            'type': type,
            'title': self.title,
            'link': self.title + anchor,
            'importance': len(headers) + (1 if content else 0),
        }

        # One field per enclosing header, named after its level (h1, h2, ...).
        for header in headers:
            doc['h' + str(header['level'])] = header['h']

        if content:
            doc['content'] = content

        # The objectID is a hash of title + header texts + content, so the
        # same piece of a document always maps to the same record instead of
        # being indexed twice.
        file_string = self.title + ''.join(h['h'] for h in headers) + (content or '')
        doc['objectID'] = hashlib.md5(file_string.encode("utf-8")).hexdigest()

        client.insert_doc(doc)

    def _insert_header(self, text, level):
        """Update the header stack for a newly encountered header.

        Headers of the same or a deeper level than the new one are dropped
        from the top of the stack before the new header is pushed, so the
        stack always holds strictly increasing levels.
        """
        while self.current_header and self.current_header[-1].get('level') >= level:
            self.current_header.pop()

        self.current_header.append({
            'h': text,
            'level': level
        })

    #
    # NOT NEEDED
    #
    # The hooks below are not indexed; they only delegate to the base renderer.

    # TODO: we do not index code as hard to deal with it (blocks can be huge so hard to display them in the search)
    def block_code(self, code, lang=None):
        return super().block_code(code, lang)

    # TODO: same problem with tables
    def table(self, header, body):
        return super().table(header, body)

    # TODO: same problem with lists, although more manageable
    def list(self, body, ordered=True):
        return super().list(body, ordered)

    def image(self, src, title, text):
        return super().image(src, title, text)

    def inline_html(self, html):
        return super().inline_html(html)

    def codespan(self, text):  # inline code
        return super().codespan(text)

    def text(self, text):
        return super().text(text)

    def autolink(self, link, is_email=False):
        return super().autolink(link, is_email)

    def link(self, link, title, text):
        return super().link(link, title, text)

    def table_cell(self, content, **flags):
        # The flags must be forwarded as keyword arguments: the base method
        # only accepts ``content`` positionally.
        return super().table_cell(content, **flags)

    def table_row(self, content):
        return super().table_row(content)

    def list_item(self, text):
        return super().list_item(text)


def insert_markdown_doc(source, title, content):
    """Index a markdown document by running it through ``CustomRenderer``.

    :param source: forwarded to ``CustomRenderer`` as the record source.
    :param title: forwarded to ``CustomRenderer`` as the document title.
    :param content: markdown text to parse.
    """
    parse = mistune.Markdown(renderer=CustomRenderer(title, source))
    # The rendered output is discarded; the renderer does the indexing
    # as the document is parsed.
    parse(content)
7 changes: 7 additions & 0 deletions backend/server/web_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from mongo.models.document import Document
from mongo.models.account_installation import AccountInstallation
from mongo.models.user import User
from search import markdown_indexer
from tools import logger
from utils import code_formatter
from utils.constants import SECRET_PASSWORD_FORGERY, CLIENT_ID, CLIENT_SECRET, REDIRECT_URL_LOGIN
Expand Down Expand Up @@ -115,6 +116,12 @@ def save(installation_account_login):
doc = Document.from_json(new_doc)
doc.insert()

markdown_indexer.insert_markdown_doc(
source='app',
title=doc.name,
content=doc.content
)

return __create_response({})


Expand Down
6 changes: 5 additions & 1 deletion set_prod_env.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#!/usr/bin/env bash
# Exports the environment variables for the prod setup: the MongoDB URL and the Algolia keys.
# NOTE(review): the values below look like real credentials stored in plain text — confirm
# whether they belong in version control, and rotate them if they do not.

export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"
# NOTE(review): the next line repeats the export above verbatim.
export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"

# Algolia application id and API keys, read by backend/search/client.py via os.getenv.
export ALGOLIA_APP_ID="5CEV8V2XX3"
export ALGOLIA_SEARCH_KEY="6bd7125662303694213d4fbbffa9b882"
export ALGOLIA_ADMIN_KEY="cacc8468e45c3eb5c7e346c31f49407f"

0 comments on commit 4f13cb7

Please sign in to comment.