Overview
EFMSA ingests emails and uploads, paraphrases them, and indexes content for instant search. The system is PHP-first, using cron and MySQL FULLTEXT.
Extraction — PHP
<?php
require __DIR__ . '/vendor/autoload.php';
use PhpOffice\PhpWord\IOFactory as WordIO;
use League\CommonMark\CommonMarkConverter;
use Smalot\PdfParser\Parser as PdfParser;
/**
 * Extract plain, indexable text from a file on disk.
 *
 * The format is selected from the file extension (case-insensitive):
 *  - docx: every section is walked recursively with PHPWord, including
 *          nested containers and tables;
 *  - md:   rendered to HTML with CommonMark, then tags are stripped and
 *          HTML entities decoded;
 *  - pdf:  text is pulled with smalot/pdfparser;
 *  - txt:  contents are returned verbatim (trimmed);
 *  - anything else: tags are stripped; for html/htm, entities are decoded too.
 *
 * @param string $path Path to the file to read.
 *
 * @return array{mime: string, text: string} Detected mime label and trimmed text.
 *
 * @throws \RuntimeException When a Markdown, text or markup file cannot be read.
 *                           The docx and pdf branches surface whatever their
 *                           libraries throw.
 */
function extract_text(string $path): array
{
    $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    if ($ext === 'docx') {
        $text = '';
        foreach (WordIO::load($path)->getSections() as $section) {
            $text .= extract_docx_element_text($section);
        }

        return [
            'mime' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'text' => trim($text),
        ];
    }

    if ($ext === 'pdf') {
        $pdf = (new PdfParser())->parseFile($path);

        return ['mime' => 'application/pdf', 'text' => trim($pdf->getText())];
    }

    // Every remaining format is read as raw bytes. A failed read must not be
    // indexed as an empty document, so it is reported instead of coerced to ''.
    $raw = file_get_contents($path);
    if ($raw === false) {
        throw new \RuntimeException("Unable to read file for text extraction: {$path}");
    }

    if ($ext === 'md') {
        $html = (new CommonMarkConverter())->convert($raw)->getContent();

        // Decode AFTER stripping tags so that escaped markup such as "&lt;b&gt;"
        // becomes literal text rather than being removed as a tag.
        $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return ['mime' => 'text/markdown', 'text' => trim($plain)];
    }

    if ($ext === 'txt') {
        return ['mime' => 'text/plain', 'text' => trim($raw)];
    }

    $plain = strip_tags($raw);
    if ($ext === 'html' || $ext === 'htm') {
        // Real HTML carries entities ("&amp;", "&nbsp;") that would otherwise
        // end up verbatim in the search index.
        $plain = html_entity_decode($plain, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    return ['mime' => 'text/plain', 'text' => trim($plain)];
}

/**
 * Recursively collect the text of a PHPWord element, one line per text node.
 *
 * Elements are probed by capability rather than by class so that containers
 * (sections, text runs, cells), tables and leaf text elements are all handled.
 *
 * @param object $element A PHPWord section or element.
 *
 * @return string Newline-terminated text fragments, or '' when there is no text.
 */
function extract_docx_element_text(object $element): string
{
    $text = '';

    // Containers: descend into every child, however deeply nested.
    if (method_exists($element, 'getElements')) {
        foreach ($element->getElements() as $child) {
            $text .= extract_docx_element_text($child);
        }

        return $text;
    }

    // Tables expose rows and cells instead of getElements().
    if (method_exists($element, 'getRows')) {
        foreach ($element->getRows() as $row) {
            foreach ($row->getCells() as $cell) {
                $text .= extract_docx_element_text($cell);
            }
        }

        return $text;
    }

    if (method_exists($element, 'getText')) {
        $value = $element->getText();

        // Some elements (e.g. headings) may hand back a nested run object
        // rather than a string; concatenating it directly would be fatal.
        if (is_object($value)) {
            return extract_docx_element_text($value);
        }

        return (string) $value . "\n";
    }

    return '';
}