/** * Python bindings for docx_comment_parser using pybind11. * * Build command (example): * c++ -O2 -std=c++17 -fPIC -shared \ * -I../include $(python3-config --includes) \ * -I$(python3 -c "import pybind11; print(pybind11.get_include())") \ * python_bindings.cpp ../src/docx_parser.cpp ../src/batch_parser.cpp \ * ../src/zip_reader.cpp \ * -lz \ * -o docx_comment_parser$(python3-config --extension-suffix) * * Usage from Python: * import docx_comment_parser as dcp * parser = dcp.DocxParser() * parser.parse("my_file.docx") * for c in parser.comments(): * print(c.author, c.text) */ #include #include #include #include "docx_comment_parser.h" namespace py = pybind11; using namespace docx; PYBIND11_MODULE(docx_comment_parser, m) { m.doc() = R"doc( docx_comment_parser =================== Fast C++ library for extracting all comment metadata from .docx files. Classes ------- CommentRef â€“ lightweight reference to a related comment CommentMetadata â€“ all data extracted for one comment DocumentCommentStatsâ€“ document-level aggregate statistics DocxParser â€“ single-file parser BatchParser â€“ multi-file parallel parser )doc"; // â”€â”€ CommentRef â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::class_(m, "CommentRef", "Lightweight reference to a related (replied-to) comment.") .def_readonly("id", &CommentRef::id, "Comment id (w:id attribute).") .def_readonly("author", &CommentRef::author, "Author name of the referenced comment.") .def_readonly("date", &CommentRef::date, "ISO-8601 date string of the referenced comment.") .def_readonly("text_snippet", &CommentRef::text_snippet, "First 120 characters of the referenced comment's text.") .def("__repr__", [](const CommentRef& r){ return ""; }); // â”€â”€ CommentMetadata â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::class_(m, "CommentMetadata", R"doc( All metadata extracted for a single comment (w:comment element). Attributes ---------- id : int â€“ w:id author : str â€“ w:author date : str â€“ ISO-8601 date (as stored in XML) initials : str â€“ w:initials text : str â€“ full plain-text of comment body paragraph_style : str â€“ style of first paragraph inside comment range_start_para_id : str â€“ paraId of commentRangeStart (OOXML 2016+) range_end_para_id : str â€“ paraId of commentRangeEnd (OOXML 2016+) referenced_text : str â€“ document text anchored by this comment is_reply : bool â€“ True if this comment is a reply parent_id : int â€“ id of parent comment (-1 if root) replies : list[CommentRef] â€“ direct replies (on parent) para_id : str â€“ unique per-comment paragraph id para_id_parent : str â€“ parent paragraph id string done : bool â€“ resolved/done flag (OOXML 2016+) paragraph_index : int â€“ 0-based paragraph in document body run_index : int â€“ 0-based run within paragraph thread_ids : list[int] â€“ ordered ids of entire thread (root only) )doc") .def_readonly("id", &CommentMetadata::id) .def_readonly("author", &CommentMetadata::author) .def_readonly("date", &CommentMetadata::date) .def_readonly("initials", &CommentMetadata::initials) .def_readonly("text", &CommentMetadata::text) .def_readonly("paragraph_style", &CommentMetadata::paragraph_style) .def_readonly("range_start_para_id", &CommentMetadata::range_start_para_id) .def_readonly("range_end_para_id", &CommentMetadata::range_end_para_id) .def_readonly("referenced_text", &CommentMetadata::referenced_text) .def_readonly("is_reply", &CommentMetadata::is_reply) .def_readonly("parent_id", &CommentMetadata::parent_id) .def_readonly("replies", &CommentMetadata::replies) .def_readonly("para_id", &CommentMetadata::para_id) .def_readonly("para_id_parent", &CommentMetadata::para_id_parent) .def_readonly("done", &CommentMetadata::done) .def_readonly("paragraph_index", &CommentMetadata::paragraph_index) .def_readonly("run_index", &CommentMetadata::run_index) .def_readonly("thread_ids", &CommentMetadata::thread_ids) .def("to_dict", [](const CommentMetadata& m) { py::dict d; d["id"] = m.id; d["author"] = m.author; d["date"] = m.date; d["initials"] = m.initials; d["text"] = m.text; d["paragraph_style"] = m.paragraph_style; d["range_start_para_id"] = m.range_start_para_id; d["range_end_para_id"] = m.range_end_para_id; d["referenced_text"] = m.referenced_text; d["is_reply"] = m.is_reply; d["parent_id"] = m.parent_id; d["para_id"] = m.para_id; d["para_id_parent"] = m.para_id_parent; d["done"] = m.done; d["paragraph_index"] = m.paragraph_index; d["run_index"] = m.run_index; d["thread_ids"] = m.thread_ids; py::list replies; for (const auto& r : m.replies) { py::dict rd; rd["id"] = r.id; rd["author"] = r.author; rd["date"] = r.date; rd["text_snippet"] = r.text_snippet; replies.append(rd); } d["replies"] = replies; return d; }, "Return all metadata as a Python dict.") .def("__repr__", [](const CommentMetadata& m){ return ""; }); // â”€â”€ DocumentCommentStats â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::class_(m, "DocumentCommentStats", R"doc( Document-level comment statistics. Attributes ---------- file_path : str total_comments : int total_resolved : int â€“ comments with done=True total_replies : int total_root_comments : int unique_authors : list[str] earliest_date : str â€“ ISO-8601 latest_date : str â€“ ISO-8601 )doc") .def_readonly("file_path", &DocumentCommentStats::file_path) .def_readonly("total_comments", &DocumentCommentStats::total_comments) .def_readonly("total_resolved", &DocumentCommentStats::total_resolved) .def_readonly("total_replies", &DocumentCommentStats::total_replies) .def_readonly("total_root_comments",&DocumentCommentStats::total_root_comments) .def_readonly("unique_authors", &DocumentCommentStats::unique_authors) .def_readonly("earliest_date", &DocumentCommentStats::earliest_date) .def_readonly("latest_date", &DocumentCommentStats::latest_date) .def("to_dict", [](const DocumentCommentStats& s){ py::dict d; d["file_path"] = s.file_path; d["total_comments"] = s.total_comments; d["total_resolved"] = s.total_resolved; d["total_replies"] = s.total_replies; d["total_root_comments"] = s.total_root_comments; d["unique_authors"] = s.unique_authors; d["earliest_date"] = s.earliest_date; d["latest_date"] = s.latest_date; return d; }, "Return stats as a Python dict.") .def("__repr__", [](const DocumentCommentStats& s){ return ""; }); // â”€â”€ DocxParser â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::class_(m, "DocxParser", R"doc( Single-file .docx comment parser. Example ------- >>> import docx_comment_parser as dcp >>> p = dcp.DocxParser() >>> p.parse("report.docx") >>> for c in p.comments(): ... print(c.id, c.author, c.text[:60]) )doc") .def(py::init<>()) .def("parse", &DocxParser::parse, py::arg("file_path"), R"doc( Parse a .docx file and extract all comment metadata. Parameters ---------- file_path : str Absolute or relative path to the .docx file. Raises ------ DocxFileError if the file cannot be opened. DocxFormatError if required OOXML parts are missing or malformed. )doc") .def("comments", &DocxParser::comments, py::return_value_policy::reference_internal, "Return list of all CommentMetadata objects (sorted by id).") .def("stats", &DocxParser::stats, py::return_value_policy::reference_internal, "Return DocumentCommentStats for the parsed file.") .def("find_by_id", [](const DocxParser& self, int id) -> py::object { const CommentMetadata* m = self.find_by_id(id); if (!m) return py::none(); return py::cast(*m); }, py::arg("id"), "Return CommentMetadata for the given id, or None if not found.") .def("by_author", [](const DocxParser& self, const std::string& author){ auto ptrs = self.by_author(author); py::list result; for (auto* p : ptrs) result.append(*p); return result; }, py::arg("author"), "Return list of CommentMetadata authored by the given person.") .def("root_comments", [](const DocxParser& self){ auto ptrs = self.root_comments(); py::list result; for (auto* p : ptrs) result.append(*p); return result; }, "Return non-reply root comments in document order.") .def("thread", [](const DocxParser& self, int root_id){ auto ptrs = self.thread(root_id); py::list result; for (auto* p : ptrs) result.append(*p); return result; }, py::arg("root_id"), "Return ordered list of CommentMetadata forming the thread for root_id."); // â”€â”€ BatchParser â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::class_(m, "BatchParser", R"doc( Multi-file parallel .docx comment parser. Example ------- >>> import docx_comment_parser as dcp, glob >>> bp = dcp.BatchParser(max_threads=4) >>> bp.parse_all(glob.glob("/docs/*.docx")) >>> for path in glob.glob("/docs/*.docx"): ... print(path, bp.stats(path).total_comments) >>> bp.release_all() )doc") .def(py::init(), py::arg("max_threads") = 0u, "Create a BatchParser. max_threads=0 uses all CPU cores.") .def("parse_all", &BatchParser::parse_all, py::arg("file_paths"), py::call_guard<:gil_scoped_release>(), R"doc( Parse a list of .docx files in parallel. Files that fail are recorded in errors() rather than raising. Parameters ---------- file_paths : list[str] )doc") .def("comments", [](const BatchParser& self, const std::string& fp) { return self.comments(fp); }, py::arg("file_path"), "Return list of CommentMetadata for a previously parsed file.") .def("stats", [](const BatchParser& self, const std::string& fp){ return self.stats(fp); }, py::arg("file_path"), "Return DocumentCommentStats for a previously parsed file.") .def("errors", [](const BatchParser& self){ py::dict d; for (const auto& kv : self.errors()) d[py::str(kv.first)] = kv.second; return d; }, "Return dict of {file_path: error_message} for files that failed.") .def("release", &BatchParser::release, py::arg("file_path"), "Free memory for a specific parsed file.") .def("release_all", &BatchParser::release_all, "Free memory for all parsed files."); // â”€â”€ Exception types â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ py::register_exception (m, "DocxFileError", PyExc_IOError); py::register_exception(m, "DocxFormatError", PyExc_ValueError); py::register_exception(m, "DocxParserError", PyExc_RuntimeError); }